diff --git a/.gitignore b/.gitignore index 831ba58..4d8ad3c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,8 @@ /**/*~ !boot_code/crt0.S /simplified-runtime.xml +**.log +**.ini +**.wlf +**transcript +**.dbg \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5ade099 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "hwpe/neureka/pulp-nnx"] + path = hwpe/neureka/pulp-nnx + url = https://github.com/pulp-platform/pulp-nnx diff --git a/astral.yaml b/astral.yaml new file mode 100644 index 0000000..d8acd6c --- /dev/null +++ b/astral.yaml @@ -0,0 +1,19 @@ +astral: + parMatrixMul8: + path: ./astral/parMatrixMul8 + command: make clean all run + parMatrixMul16: + path: ./astral/parMatrixMul16 + command: make clean all run + parMatrixMul32: + path: ./astral/parMatrixMul32 + command: make clean all run + dmr_matmul: + path: ./astral/dmr_matmul + command: make clean all run + redmule: + path: ./astral/redmule + command: make clean all run + neureka: + path: ./astral/neureka + command: make clean all run \ No newline at end of file diff --git a/astral/dmr_matmul b/astral/dmr_matmul new file mode 120000 index 0000000..c6cc223 --- /dev/null +++ b/astral/dmr_matmul @@ -0,0 +1 @@ +../reliability_tests/dmr_matmul \ No newline at end of file diff --git a/astral/ecc_test b/astral/ecc_test new file mode 120000 index 0000000..a80e9ae --- /dev/null +++ b/astral/ecc_test @@ -0,0 +1 @@ +../reliability_tests/ecc_test \ No newline at end of file diff --git a/astral/hello/Makefile b/astral/hello/Makefile new file mode 100644 index 0000000..d145a0f --- /dev/null +++ b/astral/hello/Makefile @@ -0,0 +1,5 @@ +PULP_APP = test +PULP_APP_SRCS = hello.c +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/astral/hello/hello.c b/astral/hello/hello.c new file mode 100644 index 0000000..8741206 --- /dev/null +++ b/astral/hello/hello.c @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of 
Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <stdio.h> + +int main() +{ + printf("Hello !\n"); + + return 0; +} diff --git a/astral/icache_fi_conv16 b/astral/icache_fi_conv16 new file mode 120000 index 0000000..c97475b --- /dev/null +++ b/astral/icache_fi_conv16 @@ -0,0 +1 @@ +../reliability_tests/icache_fi_conv16 \ No newline at end of file diff --git a/astral/neureka b/astral/neureka new file mode 120000 index 0000000..98cb979 --- /dev/null +++ b/astral/neureka @@ -0,0 +1 @@ +../hwpe/neureka/ \ No newline at end of file diff --git a/astral/parMatrixMul16/Makefile b/astral/parMatrixMul16/Makefile new file mode 100755 index 0000000..0fade4a --- /dev/null +++ b/astral/parMatrixMul16/Makefile @@ -0,0 +1,8 @@ +PULP_APP = test +PULP_APP_SRCS = matrixMul.c + +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +#pulp-bench-reg --name=parMatrixMul16.cycles --module=pulp_rtl_testset --pipeline=$(PIPELINE) --artefact=pulp_rtl_testset --cmd="make run -f Makefile.sdk" --probe-regexp='matrixMul -> success, nr. 
#!/usr/bin/env python
"""Generate the C stimuli header for the 16-bit parallel matrix-multiply test.

Running the script writes ``parMatrixMul16_stimuli.h`` (or the path given as
the first command-line argument).  The header holds two random SIZE x SIZE
operand matrices ``m_a`` and ``m_b``, their product ``m_exp`` (all three
flattened row by row), the ``SIZE`` macro and the ``.heapsram`` work buffers.
"""

import sys
import random

# Matrix dimension: all matrices are SIZE x SIZE.
SIZE = 24
# Operands are drawn from [-RANGE, RANGE - 1].  With SIZE = 24 this gives
# RANGE = 5, so |m_exp| <= SIZE * RANGE**2 = 600.
RANGE = 2**7 // SIZE

# Output file used when no path is given on the command line.
DEFAULT_OUTPUT = 'parMatrixMul16_stimuli.h'


def write_arr(f, name, arr):
    """Write ``arr`` to ``f`` as a ``const short`` C array called ``name``.

    One value is emitted per line, each followed by a comma; the array is
    closed with ``};`` and a blank line.
    """
    f.write('const short %s[] = {\n' % name)
    f.write(''.join('%d,\n' % v for v in arr))
    f.write('};\n\n')


def _random_operands(size, value_range):
    """Return two flat lists of ``size * size`` random operands each."""
    m_a = []
    m_b = []
    # a and b are drawn alternately, one element at a time, so a seeded
    # ``random`` module produces the same matrices as the original loops.
    for _ in range(size * size):
        m_a.append(random.randint(-value_range, value_range - 1))
        m_b.append(random.randint(-value_range, value_range - 1))
    return m_a, m_b


def _matmul(m_a, m_b, size):
    """Return the flat product of two flat, row-by-row ``size x size`` matrices."""
    return [
        sum(m_a[i * size + k] * m_b[k * size + j] for k in range(size))
        for i in range(size)
        for j in range(size)
    ]


def main(argv=None):
    """Generate the stimuli and write the header file.

    argv -- argument vector, defaults to ``sys.argv``; ``argv[1]``, when
            present, replaces the default output path.
    """
    if argv is None:
        argv = sys.argv
    out_path = argv[1] if len(argv) > 1 else DEFAULT_OUTPUT

    m_a, m_b = _random_operands(SIZE, RANGE)
    m_exp = _matmul(m_a, m_b, SIZE)

    # ``with`` closes and flushes the file even when a write fails; the
    # previous version never closed the handle.
    with open(out_path, 'w') as f:
        write_arr(f, 'm_a', m_a)
        write_arr(f, 'm_b', m_b)
        write_arr(f, 'm_exp', m_exp)

        f.write('#define SIZE %d\n' % SIZE)

        for buf in ('g_mA', 'g_mB', 'g_mC', 'g_mB_tmp'):
            f.write('__attribute__ ((section(".heapsram"))) short %s[SIZE][SIZE];\n' % buf)


if __name__ == '__main__':
    main()
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ + +#include "pulp.h" + +#include "parMatrixMul16_stimuli.h" + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()); +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "matrixMul", .test = check_matrix_mul }, + { .name = "matrixMulTranspose", .test = check_matrix_mul_transpose }, + {0, 0} +}; + +unsigned int num_cores; + +int main() +{ + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + num_cores = get_core_num(); + + if(rt_core_id() < num_cores) { + run_suite(testcases); + } + + synch_barrier(); + + return 0; +} + +void matrix_init(); +unsigned int matrix_check(); + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB[k][j]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void check_matrix_mul_transpose(testresult_t *result, void 
(*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + // transpose array before using it + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mB_tmp[i][j] = g_mB[j][i]; + } + } + + if(num_cores != 1) synch_barrier(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB_tmp[j][k]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void matrix_init() { + unsigned int i, j; + + // init, copy to TCDM + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + g_mA[i][j] = m_a[i * SIZE + j]; + g_mB[i][j] = m_b[i * SIZE + j]; + g_mC[i][j] = 0; + } + } +} + +unsigned int matrix_check() { + unsigned int errors = 0; + unsigned int i, j; + // check + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + if(g_mC[i][j] != m_exp[i * SIZE + j]) { + printf("At index %d, %d\n", i, j, 0, 0); + errors++; + } + } + } + + return errors; +} diff --git a/astral/parMatrixMul16/parMatrixMul16_stimuli.h b/astral/parMatrixMul16/parMatrixMul16_stimuli.h new file mode 100644 index 0000000..2797565 --- /dev/null +++ b/astral/parMatrixMul16/parMatrixMul16_stimuli.h @@ -0,0 +1,1742 @@ +const short m_a[] = { +3, +2, +-4, +0, +0, +0, +-5, +-3, +2, +4, +0, +2, +2, +-3, +2, +-1, +-2, +-1, +1, +4, +-3, +-3, +-1, +0, +4, +-2, +-1, +-4, +-1, +4, +1, +3, +-2, +-2, +-2, +-1, +3, +-5, +2, +0, +1, +-1, +-4, +4, +0, +-2, +4, +-5, +3, +-3, +-4, +3, +-2, +2, +-2, +3, +3, +4, +0, +-4, +4, +0, +3, +-5, +1, +0, +0, +2, +-3, +-2, +-3, +4, +-3, +1, +-5, +-5, +2, +-3, 
+-3, +-2, +-4, +0, +4, +1, +1, +0, +4, +0, +-3, +-3, +-1, +-2, +-5, +4, +-2, +1, +3, +-1, +-1, +-4, +4, +-1, +3, +-3, +2, +2, +3, +1, +2, +1, +2, +4, +-4, +-3, +-2, +-3, +4, +0, +1, +0, +-3, +0, +-2, +-3, +1, +3, +4, +-1, +4, +4, +3, +-4, +-5, +3, +2, +4, +-4, +4, +1, +-2, +2, +-1, +4, +2, +2, +-5, +0, +-2, +-4, +2, +-3, +-1, +4, +0, +4, +4, +1, +4, +4, +-3, +3, +4, +-2, +4, +-4, +2, +-2, +-3, +3, +1, +3, +3, +0, +-4, +1, +-4, +4, +2, +-2, +-5, +-4, +-4, +2, +-1, +-2, +0, +4, +-3, +4, +-5, +3, +-1, +-5, +-4, +-2, +-5, +4, +-2, +3, +-3, +4, +3, +-4, +4, +3, +-3, +4, +-1, +3, +-4, +-5, +1, +-3, +-4, +2, +3, +2, +-2, +4, +3, +2, +-4, +0, +-2, +-5, +-3, +3, +1, +3, +-5, +3, +1, +-5, +1, +-2, +-4, +1, +-5, +4, +1, +0, +0, +-4, +-2, +2, +0, +-3, +-4, +-1, +-5, +-2, +-4, +-2, +3, +4, +-5, +0, +2, +2, +4, +0, +4, +-5, +-3, +-5, +-5, +0, +0, +-4, +3, +1, +1, +-5, +2, +-5, +2, +-4, +3, +4, +-3, +-4, +4, +-2, +-4, +4, +-3, +-5, +-4, +2, +-2, +3, +-2, +1, +4, +1, +4, +-2, +1, +-4, +4, +2, +-1, +1, +1, +-2, +-2, +0, +-5, +2, +4, +-4, +1, +-5, +0, +4, +3, +-4, +1, +-3, +-4, +2, +-3, +-3, +1, +2, +4, +-3, +-2, +3, +-3, +-5, +4, +-5, +-5, +4, +4, +3, +-1, +-3, +-4, +-1, +1, +0, +1, +4, +-1, +0, +1, +-4, +0, +-4, +-3, +0, +-4, +2, +-2, +-2, +-1, +-2, +0, +-5, +-1, +-3, +3, +4, +-4, +4, +4, +0, +-4, +-3, +-3, +4, +0, +2, +3, +-1, +2, +0, +-4, +3, +-1, +-3, +-4, +4, +0, +4, +3, +0, +-3, +-5, +4, +-3, +3, +4, +3, +-1, +3, +-3, +1, +3, +4, +3, +-1, +3, +2, +4, +4, +-2, +-1, +2, +-4, +-4, +-1, +0, +-1, +4, +-4, +4, +2, +0, +0, +1, +-1, +-5, +1, +-5, +2, +-2, +2, +1, +3, +1, +-3, +-4, +-2, +0, +4, +-2, +0, +-1, +-1, +-3, +-2, +4, +3, +4, +2, +3, +4, +-4, +-2, +-1, +-2, +0, +-1, +-1, +-4, +-2, +2, +-5, +4, +-5, +0, +4, +4, +-2, +-4, +1, +-5, +-5, +4, +-3, +0, +-1, +-5, +4, +-4, +0, +4, +4, +-4, +-2, +-4, +1, +-4, +-5, +4, +-5, +-3, +2, +-2, +-1, +-3, +0, +-3, +-2, +4, +-5, +-5, +3, +-5, +-1, +3, +2, +3, +0, +-2, +-3, +-4, +3, +0, +3, +3, +-1, +-4, +0, +3, +-5, +-3, +4, +-3, +2, +-1, +0, 
+2, +-1, +3, +-1, +-5, +3, +4, +3, +3, +4, +3, +1, +-4, +1, +-2, +4, +-3, +2, +0, +-2, +-5, +-5, +-2, +-2, +0, +4, +0, +-4, +-1, +0, +-5, +-3, +4, +0, +1, +-3, +3, +-2, +-2, +2, +-1, +-1, +-3, +-1, +-2, +-5, +-5, +-1, +-1, +-4, +-4, +}; + +const short m_b[] = { +-2, +3, +2, +-5, +-3, +2, +0, +-5, +1, +-3, +-2, +-3, +-3, +3, +-4, +-2, +-4, +1, +1, +1, +4, +4, +3, +3, +-4, +-3, +-2, +-2, +-3, +-2, +-1, +-5, +-4, +1, +-4, +-3, +3, +-5, +-5, +-4, +-2, +-2, +1, +3, +3, +3, +4, +0, +1, +0, +4, +3, +-3, +3, +-1, +0, +2, +-1, +-1, +0, +0, +3, +-4, +4, +3, +3, +-1, +-4, +-1, +-4, +-2, +-4, +-4, +3, +0, +3, +-4, +2, +0, +0, +-4, +-5, +-4, +4, +0, +-4, +-2, +-4, +3, +4, +-4, +-3, +-2, +0, +-2, +2, +-5, +2, +1, +2, +0, +0, +-2, +-2, +-1, +1, +-1, +3, +3, +-1, +3, +-1, +-2, +1, +-3, +3, +3, +3, +2, +0, +1, +-2, +1, +4, +3, +-2, +-5, +0, +2, +0, +1, +-3, +-1, +3, +-3, +-2, +3, +4, +-2, +-1, +-5, +1, +-1, +2, +-3, +-4, +0, +-1, +0, +2, +3, +4, +-4, +-1, +-1, +2, +-1, +-5, +1, +-3, +0, +-4, +0, +-1, +2, +-5, +-1, +1, +-2, +-5, +-3, +-3, +2, +2, +-4, +2, +-5, +-4, +4, +4, +-1, +1, +2, +-1, +-3, +0, +2, +4, +-3, +-2, +-1, +4, +-5, +4, +0, +-1, +-1, +-4, +-3, +-3, +-1, +-2, +-1, +-2, +-5, +2, +2, +-3, +1, +-1, +-2, +0, +0, +1, +0, +4, +-4, +-4, +-4, +-5, +4, +-4, +4, +-3, +4, +-5, +-2, +3, +-2, +-4, +1, +2, +-5, +-2, +-5, +-3, +2, +-1, +3, +-4, +-4, +4, +-3, +-5, +0, +-3, +2, +2, +3, +-3, +-3, +-2, +-5, +4, +1, +3, +-1, +-4, +3, +1, +-2, +1, +3, +4, +1, +-2, +0, +0, +-3, +-1, +1, +-1, +0, +-5, +-2, +-1, +1, +-2, +4, +-3, +2, +1, +0, +0, +3, +-4, +2, +1, +0, +3, +-5, +-1, +4, +3, +4, +-2, +-1, +-3, +-4, +3, +3, +-4, +4, +-5, +1, +-1, +0, +3, +4, +3, +-3, +-5, +-3, +-2, +-4, +1, +-2, +-3, +-2, +1, +2, +4, +4, +0, +2, +4, +0, +-1, +1, +-4, +-1, +-3, +-2, +-5, +3, +-3, +-1, +0, +-3, +-3, +-4, +3, +3, +-5, +-2, +-1, +3, +1, +1, +1, +3, +1, +1, +-5, +-2, +2, +4, +3, +-3, +-5, +3, +-4, +0, +2, +-4, +-5, +-3, +-3, +-3, +2, +-1, +-3, +1, +-3, +-1, +-5, +2, +1, +1, +-3, +2, +4, +1, +-5, +3, 
+0, +-5, +-4, +2, +-5, +3, +-1, +0, +2, +-3, +-1, +-2, +-5, +-4, +0, +-5, +-3, +3, +1, +0, +-1, +-4, +1, +-4, +-5, +-1, +3, +3, +-3, +-1, +4, +-1, +-5, +-4, +-1, +0, +1, +1, +4, +-3, +-2, +4, +1, +-4, +-1, +-3, +2, +-2, +4, +2, +-1, +2, +-4, +-3, +4, +-1, +1, +-4, +0, +1, +-4, +4, +-4, +-5, +-5, +-3, +2, +2, +-1, +0, +-2, +-5, +1, +-1, +-5, +2, +-4, +-5, +-3, +1, +-4, +0, +1, +-5, +0, +-2, +1, +-4, +-5, +3, +0, +1, +2, +4, +-1, +-4, +0, +-4, +0, +-5, +3, +-5, +-5, +2, +2, +3, +-3, +4, +4, +2, +3, +-2, +4, +3, +4, +1, +1, +0, +2, +2, +4, +-1, +-5, +-5, +3, +-5, +-2, +-4, +-4, +0, +4, +3, +0, +4, +-3, +4, +0, +-1, +3, +2, +-4, +2, +-1, +-3, +-5, +-3, +3, +-4, +1, +-3, +3, +-4, +0, +3, +-2, +-1, +2, +-4, +2, +4, +0, +2, +-2, +-5, +0, +-1, +-5, +4, +-4, +3, +-1, +4, +1, +4, +0, +-1, +1, +4, +3, +3, +1, +2, +-3, +-2, +-5, +-2, +1, +-5, +-4, +-5, +-2, +}; + +const short m_exp[] = { +-32, +48, +2, +-56, +46, +-52, +39, +-77, +41, +-50, +-50, +-14, +-45, +-31, +3, +-22, +-52, +-31, +-32, +8, +61, +97, +49, +-12, +12, +-25, +-26, +-15, +15, +45, +-29, +-27, +-7, +-23, +30, +-36, +-56, +27, +-59, +35, +6, +44, +72, +23, +17, +86, +-8, +5, +-25, +41, +-30, +-78, +120, +-1, +69, +-20, +24, +-66, +6, +68, +-59, +2, +27, +-21, +-71, +-61, +-49, +21, +-24, +27, +-47, +29, +71, +37, +-9, +-58, +13, +13, +73, +-9, +7, +28, +23, +35, +46, +-8, +86, +-7, +25, +-65, +-18, +43, +58, +22, +44, +-51, +-33, +10, +-18, +-17, +-62, +24, +18, +-68, +23, +25, +23, +-32, +23, +37, +55, +-16, +24, +4, +-16, +9, +77, +44, +48, +-36, +-81, +-100, +-18, +36, +-3, +-72, +-43, +38, +-2, +34, +53, +-42, +-18, +56, +19, +34, +3, +-58, +-16, +-34, +-71, +-12, +12, +-22, +-9, +57, +-46, +-70, +25, +-62, +10, +-5, +68, +-86, +-15, +-19, +-100, +78, +23, +47, +5, +-64, +31, +-16, +-9, +-22, +11, +30, +-74, +31, +50, +38, +-59, +24, +12, +-18, +4, +33, +-24, +-44, +-19, +2, +-47, +48, +-30, +28, +-38, +-29, +20, +52, +-21, +-5, +49, +-41, +-17, +-2, +68, +-2, +122, +-29, +13, +23, +46, +40, +-12, +-58, 
+59, +2, +15, +-49, +-38, +-15, +46, +-1, +-54, +-73, +10, +-1, +43, +80, +-55, +119, +82, +22, +-12, +-14, +-12, +-3, +36, +27, +-3, +50, +59, +61, +13, +-14, +5, +51, +-54, +-45, +-9, +109, +10, +-28, +1, +0, +-50, +-23, +27, +49, +16, +3, +23, +43, +31, +29, +-27, +-45, +9, +61, +42, +34, +30, +26, +38, +-84, +5, +98, +13, +55, +15, +56, +63, +22, +80, +10, +52, +52, +49, +101, +37, +71, +-20, +-38, +-36, +-72, +-40, +12, +77, +-19, +32, +9, +-15, +85, +-13, +-33, +-8, +-20, +33, +36, +76, +6, +67, +-51, +36, +76, +-48, +30, +42, +-45, +-22, +-9, +65, +-29, +18, +89, +34, +-25, +41, +30, +26, +49, +34, +-6, +-28, +10, +-106, +6, +86, +5, +-3, +-105, +-103, +-45, +-55, +-60, +29, +41, +54, +-29, +36, +-30, +-8, +25, +36, +40, +22, +-35, +-50, +33, +3, +-19, +-30, +3, +-11, +9, +-4, +-18, +39, +92, +-25, +0, +-40, +64, +-18, +58, +-2, +27, +-89, +32, +-2, +56, +79, +-67, +87, +-41, +20, +29, +4, +51, +45, +26, +-54, +8, +-37, +-37, +-39, +-35, +-36, +-23, +-30, +11, +1, +-63, +-21, +5, +13, +35, +-15, +27, +-59, +-10, +29, +4, +-58, +-56, +3, +-10, +27, +0, +29, +-4, +-17, +28, +2, +17, +9, +-27, +53, +-19, +-36, +104, +25, +38, +84, +-29, +42, +-10, +-47, +-36, +1, +10, +43, +-46, +-57, +38, +44, +-9, +26, +-15, +10, +31, +34, +-18, +17, +24, +-3, +5, +11, +4, +23, +5, +-19, +51, +-59, +-29, +62, +-27, +35, +81, +65, +-10, +-1, +-1, +45, +20, +26, +-34, +-7, +35, +-7, +2, +19, +115, +-32, +-40, +-92, +47, +-65, +23, +53, +-31, +22, +20, +45, +91, +64, +29, +49, +30, +74, +-15, +22, +120, +16, +122, +-51, +51, +78, +60, +-53, +40, +-49, +73, +-2, +14, +-17, +-86, +35, +-25, +36, +2, +5, +38, +-9, +16, +-12, +-41, +-24, +-31, +-72, +-68, +-9, +5, +11, +-63, +15, +12, +21, +22, +19, +62, +-28, +122, +12, +28, +-59, +-19, +49, +57, +-12, +29, +-1, +-23, +126, +75, +-29, +-21, +-37, +8, +-70, +-8, +91, +-34, +-10, +31, +78, +44, +59, +23, +66, +42, +38, +6, +19, +10, +1, +57, +38, +71, +-10, +-12, +-7, +-13, +-15, +6, +}; + +#define SIZE 24 +__attribute__ 
#!/usr/bin/env python
"""Generate the C stimuli header for the 32-bit parallel matrix-multiply test.

Running the script writes ``parMatrixMul32_stimuli.h`` (or the path given as
the first command-line argument).  The header holds two random SIZE x SIZE
operand matrices ``m_a`` and ``m_b``, their product ``m_exp`` (all three
flattened row by row), the ``SIZE`` macro and the ``.heapsram`` work buffers.
"""

import sys
import random

# Matrix dimension: all matrices are SIZE x SIZE.
SIZE = 24
# Operands are drawn from [-RANGE, RANGE - 1].  With SIZE = 24 this gives
# RANGE = 1365, so |m_exp| <= SIZE * RANGE**2 = 44717400.
RANGE = 2**15 // SIZE

# Output file used when no path is given on the command line.
DEFAULT_OUTPUT = 'parMatrixMul32_stimuli.h'


def write_arr(f, name, arr):
    """Write ``arr`` to ``f`` as a ``const int`` C array called ``name``.

    One value is emitted per line, each followed by a comma; the array is
    closed with ``};`` and a blank line.
    """
    f.write('const int %s[] = {\n' % name)
    f.write(''.join('%d,\n' % v for v in arr))
    f.write('};\n\n')


def _random_operands(size, value_range):
    """Return two flat lists of ``size * size`` random operands each."""
    m_a = []
    m_b = []
    # a and b are drawn alternately, one element at a time, so a seeded
    # ``random`` module produces the same matrices as the original loops.
    for _ in range(size * size):
        m_a.append(random.randint(-value_range, value_range - 1))
        m_b.append(random.randint(-value_range, value_range - 1))
    return m_a, m_b


def _matmul(m_a, m_b, size):
    """Return the flat product of two flat, row-by-row ``size x size`` matrices."""
    return [
        sum(m_a[i * size + k] * m_b[k * size + j] for k in range(size))
        for i in range(size)
        for j in range(size)
    ]


def main(argv=None):
    """Generate the stimuli and write the header file.

    argv -- argument vector, defaults to ``sys.argv``; ``argv[1]``, when
            present, replaces the default output path.
    """
    if argv is None:
        argv = sys.argv
    out_path = argv[1] if len(argv) > 1 else DEFAULT_OUTPUT

    m_a, m_b = _random_operands(SIZE, RANGE)
    m_exp = _matmul(m_a, m_b, SIZE)

    # ``with`` closes and flushes the file even when a write fails; the
    # previous version never closed the handle.
    with open(out_path, 'w') as f:
        write_arr(f, 'm_a', m_a)
        write_arr(f, 'm_b', m_b)
        write_arr(f, 'm_exp', m_exp)

        f.write('#define SIZE %d\n' % SIZE)

        for buf in ('g_mA', 'g_mB', 'g_mC', 'g_mB_tmp'):
            f.write('__attribute__ ((section(".heapsram"))) int %s[SIZE][SIZE];\n' % buf)


if __name__ == '__main__':
    main()
+ */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ + +#include "pulp.h" + +#include "parMatrixMul32_stimuli.h" + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()); +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "matrixMul", .test = check_matrix_mul }, + { .name = "matrixMulTranspose", .test = check_matrix_mul_transpose }, + {0, 0} +}; + +unsigned int num_cores; + +int main() +{ + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + num_cores = get_core_num(); + + if(rt_core_id() < num_cores) { + run_suite(testcases); + } + + synch_barrier(); + + return 0; +} + +void matrix_init(); +unsigned int matrix_check(); + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB[k][j]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + // transpose 
array before using it + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mB_tmp[i][j] = g_mB[j][i]; + } + } + + if(num_cores != 1) synch_barrier(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB_tmp[j][k]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void matrix_init() { + unsigned int i, j; + + // init, copy to TCDM + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + g_mA[i][j] = m_a[i * SIZE + j]; + g_mB[i][j] = m_b[i * SIZE + j]; + g_mC[i][j] = 0; + } + } +} + +unsigned int matrix_check() { + unsigned int errors = 0; + unsigned int i, j; + // check + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + if(g_mC[i][j] != m_exp[i * SIZE + j]) { + printf("At index %d, %d\n", i, j, 0, 0); + errors++; + } + } + } + + return errors; +} diff --git a/astral/parMatrixMul32/parMatrixMul32_stimuli.h b/astral/parMatrixMul32/parMatrixMul32_stimuli.h new file mode 100644 index 0000000..d5c1af0 --- /dev/null +++ b/astral/parMatrixMul32/parMatrixMul32_stimuli.h @@ -0,0 +1,1742 @@ +const int m_a[] = { +-1344, +-778, +-350, +1240, +950, +940, +1262, +285, +738, +-37, +257, +704, +87, +-1135, +1175, +960, +984, +611, +-1170, +1243, +-729, +-1235, +-1229, +8, +52, +1156, +-804, +-129, +-112, +-307, +1045, +-954, +-944, +477, +1104, +-600, +-31, +1364, +-950, +-153, +967, +-446, +235, +-197, +763, +-660, +-1289, +116, +-458, +994, +1086, +156, +-549, +102, +-532, +943, +299, +340, +516, +1117, +515, +1335, +708, +-306, +-589, +517, +655, +-279, +-595, +-255, +-1104, +-16, +507, +199, +-297, +-905, +-1084, +982, +20, +-1364, +407, +333, +-798, +-711, +1301, +175, +490, +-165, +-1097, +1251, +1203, +-884, +419, +-1262, +-950, +-1200, +759, +-205, +-1365, +-870, +109, +-1135, +36, +796, +-1233, +-1117, +-826, +241, +588, +-513, +-353, +791, +1071, +-1073, +220, +-1094, +1096, 
+-723, +280, +-505, +151, +399, +319, +-1120, +-213, +-966, +679, +497, +-290, +-300, +-290, +-599, +243, +-752, +604, +1196, +-715, +-177, +-329, +1337, +596, +1027, +509, +-301, +-1211, +-936, +-281, +446, +-356, +841, +-1123, +-1343, +-140, +-1300, +-828, +-237, +1206, +1274, +-1092, +-922, +913, +1201, +-422, +782, +-230, +633, +-1055, +-1160, +329, +1255, +1038, +770, +123, +934, +670, +-492, +-24, +-779, +-1129, +47, +555, +1214, +-232, +-716, +-322, +-126, +178, +827, +710, +-1057, +-313, +996, +1149, +-532, +570, +1171, +899, +-313, +-790, +1071, +154, +-303, +492, +-918, +-1139, +292, +129, +1347, +-309, +751, +1262, +142, +-1062, +-1305, +250, +657, +238, +-141, +1308, +-37, +-514, +-591, +-611, +852, +-653, +-640, +91, +254, +-1145, +-1263, +-838, +-10, +266, +-444, +1129, +762, +-713, +-326, +-88, +1063, +-442, +-177, +365, +-740, +-1219, +1085, +783, +-725, +-1112, +426, +660, +6, +-440, +513, +687, +1078, +212, +-434, +-953, +1337, +160, +622, +-950, +-943, +288, +-136, +-1103, +-223, +1271, +211, +251, +-271, +-26, +704, +1177, +544, +699, +-885, +-864, +-1280, +877, +-461, +995, +-623, +-121, +-146, +-484, +-225, +-978, +163, +-278, +-502, +-505, +-567, +-771, +1279, +699, +-1337, +544, +1145, +1271, +640, +277, +-164, +458, +-1280, +-602, +-2, +1136, +1203, +-699, +-195, +659, +-472, +1230, +1151, +-97, +-77, +-772, +-381, +-295, +636, +-1341, +-445, +-806, +531, +-1186, +-1313, +-274, +835, +-446, +558, +-1307, +-235, +43, +-254, +-109, +911, +-1189, +559, +-854, +-218, +149, +580, +1158, +-14, +181, +-1120, +-947, +-542, +1142, +631, +-893, +-614, +-257, +-365, +-951, +1, +-762, +268, +382, +-131, +808, +-234, +839, +346, +-733, +1251, +496, +-566, +-751, +581, +-1292, +1068, +-932, +-855, +1336, +-280, +523, +1294, +-1251, +1284, +-1276, +87, +1264, +-274, +-922, +-289, +-458, +-117, +196, +-79, +-707, +1233, +-385, +-620, +-617, +703, +-995, +-374, +660, +145, +821, +1289, +582, +-201, +447, +116, +759, +-615, +834, +268, +-1114, +-1016, +-227, 
+-589, +-910, +-244, +-660, +764, +219, +1165, +506, +-673, +799, +-1355, +-872, +491, +689, +176, +-285, +1151, +1080, +-319, +286, +833, +217, +-621, +478, +539, +-109, +-1273, +-564, +-240, +504, +518, +256, +-124, +74, +949, +-912, +-1341, +965, +-774, +634, +1009, +1304, +200, +-1041, +-1262, +-865, +-1065, +-635, +-357, +-928, +806, +1148, +-411, +56, +686, +-644, +1241, +-430, +297, +127, +457, +-1313, +741, +861, +220, +-540, +772, +265, +1066, +679, +177, +-734, +29, +-149, +181, +-1042, +-1139, +271, +-326, +-29, +1298, +643, +-890, +-136, +-1015, +-565, +-964, +894, +-312, +698, +159, +-222, +-1322, +578, +945, +1124, +1278, +54, +-389, +1101, +362, +-543, +380, +959, +-399, +-1105, +1308, +338, +-198, +-1111, +-278, +-752, +668, +1156, +-1226, +579, +184, +-1084, +-917, +-498, +-466, +316, +-788, +-718, +468, +367, +-1333, +-1146, +828, +1329, +311, +-1346, +54, +-976, +854, +-658, +-198, +-979, +156, +385, +-659, +1326, +1351, +-1173, +-648, +720, +-40, +313, +729, +-416, +351, +452, +-413, +-4, +-1113, +-612, +-28, +-721, +400, +1072, +-1010, +}; + +const int m_b[] = { +-1316, +319, +963, +-608, +519, +-783, +-676, +181, +172, +203, +-1351, +-935, +-12, +758, +-746, +1226, +127, +-1346, +1251, +-377, +889, +-23, +-417, +-122, +680, +1363, +729, +-907, +-1263, +-431, +363, +1355, +-566, +-517, +-1186, +1318, +-1104, +-1245, +950, +687, +252, +-270, +1081, +-1290, +656, +8, +60, +1171, +915, +-500, +678, +-953, +307, +-35, +-1334, +-888, +598, +1160, +722, +850, +-268, +988, +635, +-340, +252, +1208, +420, +82, +1283, +-319, +-666, +172, +583, +174, +471, +-1063, +452, +-191, +-1188, +116, +-927, +1086, +119, +-245, +-717, +-657, +417, +319, +1133, +1338, +141, +-546, +567, +-1089, +-191, +-1138, +-201, +-1286, +-820, +-1356, +1177, +-317, +191, +67, +164, +-306, +-1015, +1147, +-482, +1229, +-259, +-207, +1309, +847, +-399, +-1005, +-995, +140, +-567, +-1220, +-427, +180, +-571, +997, +-783, +-316, +-1360, +736, +75, +-1251, +-307, +-902, +1181, +1057, 
+-141, +-1098, +776, +1096, +-923, +914, +1049, +-28, +-742, +-804, +-467, +567, +329, +-309, +-161, +-157, +-430, +-639, +1138, +-165, +292, +-20, +777, +-715, +60, +-1359, +35, +307, +-1092, +271, +548, +822, +-50, +-475, +-103, +784, +537, +152, +-517, +-1097, +117, +-619, +538, +941, +172, +-223, +1161, +-1004, +-1145, +-455, +255, +363, +859, +403, +-861, +-657, +-537, +-1084, +-1042, +541, +1283, +-356, +1298, +-1254, +-303, +203, +104, +1123, +-72, +-171, +-1122, +-533, +440, +275, +613, +-846, +-189, +884, +704, +-570, +-440, +-1157, +-200, +-80, +616, +799, +757, +-264, +-1256, +-690, +152, +184, +-810, +-221, +-821, +-243, +508, +-709, +574, +-693, +315, +-952, +952, +697, +875, +-480, +-691, +422, +-413, +-1199, +441, +-751, +821, +1303, +-410, +416, +566, +-131, +-551, +46, +978, +-228, +1117, +-251, +-537, +874, +-882, +260, +-213, +248, +-1296, +1343, +-626, +-812, +629, +-601, +-378, +-1314, +-889, +774, +-307, +692, +-1125, +-692, +923, +947, +1158, +-939, +1284, +35, +1299, +369, +-8, +43, +768, +524, +137, +659, +285, +-1315, +-457, +871, +-768, +1107, +-695, +488, +-527, +-161, +414, +-526, +-1164, +1059, +-1108, +560, +-622, +898, +-50, +-286, +-170, +513, +952, +433, +237, +584, +-665, +-960, +585, +-434, +1223, +-130, +1035, +430, +202, +1312, +1152, +1059, +-1082, +-1295, +805, +-18, +613, +-94, +557, +548, +1354, +116, +289, +-1358, +-1234, +1237, +451, +820, +-102, +974, +832, +-1019, +914, +-512, +-267, +1329, +-910, +-1341, +862, +-381, +-23, +-658, +40, +-71, +-782, +1240, +-956, +1241, +-291, +-884, +1250, +699, +834, +190, +960, +-1260, +177, +464, +155, +-1105, +768, +424, +621, +740, +-1357, +1186, +-594, +1329, +829, +126, +1101, +1146, +-95, +605, +-673, +1334, +440, +-10, +12, +-745, +20, +19, +-793, +999, +1083, +487, +-657, +-356, +654, +-326, +-250, +-718, +-947, +-235, +558, +974, +-981, +637, +-861, +-768, +1045, +-583, +-910, +128, +734, +896, +-1156, +223, +284, +272, +634, +-473, +363, +359, +-1185, +14, +-33, +-1122, 
+140, +900, +439, +-944, +-770, +663, +865, +1056, +-238, +86, +-1294, +-44, +-603, +602, +20, +397, +-423, +-703, +-209, +-906, +-1236, +945, +-737, +578, +904, +645, +1225, +-877, +-425, +-493, +-1326, +424, +965, +1300, +-1210, +823, +1345, +626, +-427, +592, +-869, +-1055, +-938, +-427, +1066, +472, +-1055, +48, +-1200, +-349, +313, +-1227, +-228, +783, +839, +187, +1021, +-1355, +1284, +68, +-1321, +-997, +1286, +-887, +772, +-156, +-105, +1329, +1141, +-377, +-881, +-341, +1316, +-391, +-1249, +-205, +53, +-266, +-540, +-289, +-1011, +602, +-1032, +-1097, +-202, +-467, +-1047, +-867, +-340, +-109, +-496, +967, +1147, +108, +384, +-12, +1216, +137, +1318, +151, +219, +-543, +391, +668, +-1348, +-1244, +-810, +-676, +321, +-1258, +1343, +1214, +791, +35, +1219, +1278, +1037, +-1282, +661, +585, +921, +-880, +-989, +-1192, +-207, +273, +-382, +690, +165, +271, +-212, +739, +-343, +-42, +226, +40, +859, +-153, +622, +-1059, +}; + +const int m_exp[] = { +-4942391, +2289133, +-1363225, +1978230, +1580032, +-625813, +-3230128, +2236653, +19494, +3242695, +-1080745, +-34154, +4086860, +-1370876, +3997221, +-1812380, +4705498, +7690207, +-4068140, +3595067, +-1103308, +-939857, +-4249710, +-8650816, +-2013119, +2933624, +235162, +-453807, +-4447391, +3527041, +2046492, +411956, +-994117, +-1411344, +1333704, +-519761, +3026373, +-564969, +3749147, +2447173, +-557628, +1138674, +-1426096, +-4033488, +-1829685, +2815607, +2382958, +1714081, +-1470484, +3379876, +3660759, +-2439960, +-1180478, +-3300785, +-5104533, +-309753, +-1667400, +3258850, +1805449, +2481948, +-944985, +-363123, +4227063, +3022289, +2763211, +5114077, +-1534394, +-2957168, +3401637, +-1195822, +-747480, +-2915318, +-2505013, +-174927, +276733, +2899369, +-6702856, +923396, +-2741169, +4270685, +-1020657, +-2562887, +2074098, +-2382784, +1366504, +691209, +4127820, +400356, +-7415505, +823772, +-3848400, +-158560, +3759990, +2298445, +323394, +873625, +474364, +2617120, +-1382444, +1735284, 
+-5799715, +1915577, +7016057, +-1212904, +919286, +2949768, +1228832, +572192, +4145710, +-6809520, +-2199597, +677764, +-7169579, +-4904277, +6902014, +231123, +-4797299, +3093608, +989455, +4324476, +3121268, +810907, +-2457323, +2299211, +-1625774, +-141013, +3343022, +-2044657, +4089375, +-291323, +-1950307, +2480885, +2846731, +-2139146, +-2718414, +-1997531, +-2399245, +-4060224, +423228, +-205276, +-1602384, +910872, +-3535114, +6008729, +-559984, +-301205, +-5407307, +2981269, +1079061, +6602535, +-857708, +2756391, +-5304566, +-3769267, +2620777, +-4409088, +502077, +7568647, +-37918, +2315061, +-2540065, +8296540, +-7465282, +1553910, +-4736227, +-2139045, +95614, +342546, +1309722, +1777391, +2981296, +-736899, +-2572111, +-972463, +-2793724, +-2893912, +-1230264, +-871649, +-1439985, +3074445, +3339004, +2423842, +1751086, +4064832, +1550243, +6225792, +-503750, +-2567772, +5671219, +-2054796, +-551487, +-2787790, +-3835027, +-4272806, +2725813, +-2982521, +1803437, +3024675, +-201092, +-1626608, +1548043, +2303810, +3032912, +398283, +1704371, +1860306, +-4101665, +5187913, +4233418, +2054883, +3603470, +1935132, +-17548, +-4362444, +-2806918, +-5651039, +-1853372, +-1707208, +-153048, +-2791834, +402265, +2815962, +3391662, +-4833520, +-1190520, +2302448, +80738, +2089586, +174096, +2837490, +-5514606, +2138871, +336249, +-378675, +5833977, +4367000, +-2445147, +-3652299, +-1794451, +-1471577, +-1263012, +3719274, +-3404819, +5765304, +-4256415, +3558206, +-1884441, +-475244, +3659623, +-2914867, +689238, +-2576754, +7739914, +1823902, +2077002, +-2365242, +2023481, +1663749, +-4973435, +-694558, +1118078, +2260786, +-3256285, +4596746, +-5421599, +594942, +-1730692, +-4626077, +3077882, +-2232009, +2672161, +3135747, +-4602601, +859784, +-3530668, +21600, +-4690786, +2023164, +-496745, +-2728919, +281474, +-108745, +-809613, +1445687, +-5781458, +2097169, +1594266, +-4504019, +2460482, +6259537, +-700848, +413263, +-1212884, +-5695130, +2094147, 
+-750529, +-1379008, +6029072, +722889, +1719449, +1682336, +-4219755, +1971162, +66575, +-1195119, +141466, +-1083536, +-718558, +-4039954, +-168429, +-2026861, +2025800, +-761083, +-4194692, +2013337, +-1156936, +3823019, +4081732, +-3145845, +-1733615, +-1371947, +-3811245, +-1584663, +-3547009, +-3267886, +8255291, +-3232160, +3404636, +3248369, +3233853, +671601, +-1009897, +1821121, +-3517645, +2005444, +-2768741, +115998, +533867, +4717709, +1315923, +-3510545, +-3539595, +-538461, +4529529, +2792584, +-107486, +-1840413, +-1474849, +1579605, +-4197602, +-34825, +-462678, +1294881, +-1730927, +-2549709, +-1531672, +-271859, +-1181904, +-1680154, +-2321723, +-6641222, +1127764, +893535, +-2804646, +5653509, +2657606, +-1751466, +-4669812, +-827592, +-126901, +-2599752, +-845148, +1390838, +8975481, +-7663778, +3572438, +5920790, +5233883, +-613590, +-881500, +-3974422, +-5523348, +-3243204, +-6405765, +-4376438, +1352634, +-105650, +2650174, +1442151, +5088231, +2974595, +-4501663, +-841006, +-3101819, +-1265401, +-2756903, +2579743, +2045040, +-5328835, +2801176, +-386694, +-3068782, +3147225, +-248211, +-662659, +-1112717, +2733193, +336344, +3107302, +2244003, +4285762, +1998904, +1888720, +-1174981, +-2567532, +-5588952, +-101948, +4004848, +-610048, +793760, +3345423, +716318, +1033698, +4011882, +-965219, +1258434, +1579522, +-4249500, +3233648, +-424838, +2640541, +1020028, +4933599, +-1964947, +3237309, +-1251962, +-437406, +-2749192, +-2943112, +-117113, +778507, +2757711, +3478291, +-661571, +1077087, +-3821174, +2731860, +3035264, +-4424379, +295413, +3873542, +-1272809, +4145370, +-363272, +2240544, +88954, +-2016552, +-862779, +-844808, +3142493, +2019692, +3648148, +3857820, +593190, +1285134, +-4257140, +-1476035, +-1951773, +-2334649, +1355368, +-4390456, +3666652, +562848, +-8226958, +1134896, +1136697, +-2132899, +3300228, +1855661, +6476864, +5097743, +-1373818, +3287769, +1709294, +-2926119, +2463141, +400199, +3051372, +1815531, +1746372, 
+398117, +2333959, +-708565, +-4241370, +51697, +-1626217, +26865, +2248300, +3357859, +-325912, +194201, +612298, +388227, +256018, +-5630155, +-1085451, +653494, +-1966315, +-273079, +-4296295, +-2813232, +2079672, +2378463, +-3869089, +-438799, +-725265, +3152791, +3461913, +-777750, +47521, +2588203, +1888001, +-4445421, +654349, +811737, +418334, +-1854075, +-5194402, +-1571674, +-622026, +-1091628, +1787463, +3439585, +2923276, +-1997884, +-193963, +-731696, +3686658, +-1311796, +-5219031, +-2906251, +2140229, +-1846978, +2541247, +-3677377, +-3935140, +3605308, +4807232, +-1633864, +344286, +-2051894, +2498349, +-3085, +379207, +-701595, +-1080351, +3161365, +-1606976, +1640595, +3757649, +798095, +-3167055, +-2288739, +2301831, +-3324819, +3219538, +516049, +-3153835, +7342606, +1098913, +-2522436, +376783, +47367, +530901, +-395499, +304200, +}; + +#define SIZE 24 +__attribute__ ((section(".heapsram"))) int g_mA[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int g_mB[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int g_mC[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int g_mB_tmp[SIZE][SIZE]; diff --git a/astral/parMatrixMul8/Makefile b/astral/parMatrixMul8/Makefile new file mode 100755 index 0000000..7c755b8 --- /dev/null +++ b/astral/parMatrixMul8/Makefile @@ -0,0 +1,8 @@ +PULP_APP = test +PULP_APP_SRCS = matrixMul.c + +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +#pulp-bench-reg --name=parMatrixMul8.cycles --module=pulp_rtl_testset --pipeline=$(PIPELINE) --artefact=pulp_rtl_testset --cmd="make run -f Makefile.sdk" --probe-regexp='matrixMul -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(8)" --probe-regexp='matrixMulTranspose -> success, nr. 
of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(8),transposed" diff --git a/astral/parMatrixMul8/gen_stimuli.py b/astral/parMatrixMul8/gen_stimuli.py new file mode 100755 index 0000000..153d5c3 --- /dev/null +++ b/astral/parMatrixMul8/gen_stimuli.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import sys +import random + + +def write_arr(f, name, arr): + f.write('const char %s[] = {\n' % name) + for v in arr: + f.write('%d,\n' % (v)) + f.write('};\n\n') + return + +################################################################################ +f = open('parMatrixMul8_stimuli.h', 'w') + + +SIZE = 24 +RANGE = 4 + +m_a = [] +m_b = [] +m_exp = [] + +for i in range(0,SIZE): + for j in range(0,SIZE): + a = random.randint(-RANGE, RANGE-1) + b = random.randint(-RANGE, RANGE-1) + + m_a.append(a) + m_b.append(b) + +for i in range(0,SIZE): + for j in range(0,SIZE): + r = 0 + + for k in range (0,SIZE): + r = r + m_a[i * SIZE + k] * m_b[k * SIZE + j] + + m_exp.append(r) + + +write_arr(f, 'm_a', m_a) +write_arr(f, 'm_b', m_b) +write_arr(f, 'm_exp', m_exp) + +f.write('#define SIZE %d\n' % SIZE) + + +f.write('__attribute__ ((section(".heapsram"))) char g_mA[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) char g_mB[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) char g_mC[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) char g_mB_tmp[SIZE][SIZE];\n') + diff --git a/astral/parMatrixMul8/matrixMul.c b/astral/parMatrixMul8/matrixMul.c new file mode 100644 index 0000000..357fdf0 --- /dev/null +++ b/astral/parMatrixMul8/matrixMul.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ + +#include "pulp.h" + +#include "parMatrixMul8_stimuli.h" + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()); +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "matrixMul", .test = check_matrix_mul }, + { .name = "matrixMulTranspose", .test = check_matrix_mul_transpose }, + {0, 0} +}; + +unsigned int num_cores; + +int main() +{ + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + num_cores = get_core_num(); + + if(rt_core_id() < num_cores) { + run_suite(testcases); + } + + synch_barrier(); + + return 0; +} + +void matrix_init(); +unsigned int matrix_check(); + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB[k][j]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void check_matrix_mul_transpose(testresult_t *result, void 
(*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + // transpose array before using it + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mB_tmp[i][j] = g_mB[j][i]; + } + } + + if(num_cores != 1) synch_barrier(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB_tmp[j][k]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void matrix_init() { + unsigned int i, j; + + // init, copy to TCDM + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + g_mA[i][j] = m_a[i * SIZE + j]; + g_mB[i][j] = m_b[i * SIZE + j]; + g_mC[i][j] = 0; + } + } +} + +unsigned int matrix_check() { + unsigned int errors = 0; + unsigned int i, j; + // check + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + if(g_mC[i][j] != m_exp[i * SIZE + j]) { + printf("At index %d, %d\n", i, j, 0, 0); + errors++; + } + } + } + + return errors; +} diff --git a/astral/parMatrixMul8/parMatrixMul8_stimuli.h b/astral/parMatrixMul8/parMatrixMul8_stimuli.h new file mode 100644 index 0000000..78d026c --- /dev/null +++ b/astral/parMatrixMul8/parMatrixMul8_stimuli.h @@ -0,0 +1,1742 @@ +const char m_a[] = { +-1, +0, +0, +0, +-4, +-4, +-4, +1, +-1, +-2, +-3, +3, +2, +-4, +1, +-2, +-3, +0, +-3, +-4, +-3, +-1, +-3, +-3, +2, +-2, +2, +3, +0, +2, +0, +-1, +-3, +-2, +-1, +-3, +1, +-1, +-1, +0, +-1, +-2, +-1, +-4, +-2, +-3, +-1, +-2, +1, +1, +2, +-1, +0, +2, +1, +-3, +2, +-3, +0, +0, +0, +2, +-2, +3, +-4, +2, +-2, +-3, +-2, +-3, +3, +-3, +0, +1, +-1, +-1, +1, +-2, 
+-1, +-4, +-2, +-1, +-4, +-4, +-2, +-1, +-2, +-3, +-4, +-4, +1, +0, +-2, +1, +-1, +1, +2, +-3, +-4, +0, +2, +-3, +-3, +0, +0, +-2, +-3, +-1, +-2, +-1, +-4, +-1, +-4, +-3, +-3, +0, +1, +-3, +-4, +1, +2, +0, +2, +-1, +-4, +-1, +-4, +2, +2, +-1, +3, +0, +0, +-1, +1, +3, +0, +2, +0, +3, +2, +2, +2, +-3, +2, +2, +-4, +0, +-3, +-1, +-1, +-3, +-4, +3, +2, +0, +2, +1, +0, +-2, +-1, +3, +3, +0, +1, +-3, +3, +0, +-3, +2, +-3, +1, +2, +0, +3, +-1, +0, +-2, +-4, +0, +-3, +0, +3, +0, +-1, +-3, +0, +-2, +-4, +-4, +-1, +1, +2, +-4, +1, +-4, +-4, +-3, +-3, +3, +-1, +-2, +-4, +2, +1, +2, +-1, +1, +3, +0, +2, +-4, +-3, +1, +-2, +0, +1, +1, +3, +-2, +-4, +2, +-1, +-1, +1, +2, +-2, +1, +1, +2, +0, +3, +-4, +2, +-1, +-1, +-3, +-3, +-3, +-3, +2, +-3, +1, +0, +3, +-3, +3, +3, +-4, +1, +-4, +1, +2, +-3, +1, +-4, +-1, +-4, +-3, +-2, +2, +3, +0, +-4, +1, +3, +0, +3, +-1, +-4, +3, +-2, +-2, +2, +-4, +-1, +-1, +3, +-2, +-4, +-1, +-1, +2, +-1, +2, +0, +-2, +-4, +0, +2, +-4, +0, +1, +-4, +2, +0, +-1, +-1, +-3, +2, +3, +2, +-2, +1, +-4, +-3, +1, +2, +-1, +-4, +3, +-1, +2, +-2, +-3, +0, +-1, +3, +-4, +-3, +-3, +-3, +0, +0, +3, +1, +-2, +-2, +-4, +2, +2, +0, +3, +-2, +1, +3, +-1, +3, +-4, +-2, +0, +-4, +0, +-1, +-3, +-2, +-4, +-4, +-2, +-4, +-3, +-3, +0, +2, +-4, +1, +0, +-3, +0, +-4, +-4, +3, +3, +-2, +1, +3, +3, +1, +-3, +-4, +-4, +1, +1, +2, +-4, +0, +3, +-1, +3, +1, +-4, +-2, +1, +-1, +3, +3, +-1, +-1, +0, +-2, +2, +-2, +-2, +1, +3, +1, +3, +-1, +2, +-1, +0, +1, +2, +-4, +-3, +1, +-3, +-1, +-4, +-1, +-4, +-4, +-3, +-3, +3, +0, +3, +0, +-2, +3, +-3, +3, +-1, +-1, +2, +-2, +1, +-4, +0, +-3, +3, +0, +-2, +-1, +3, +2, +-2, +3, +-3, +0, +-2, +1, +0, +-1, +1, +3, +-4, +3, +0, +3, +-4, +3, +-1, +-1, +1, +3, +1, +0, +-4, +3, +-3, +-4, +-1, +2, +-3, +-4, +-2, +-3, +2, +3, +-2, +-2, +-4, +-1, +1, +-4, +-2, +2, +-3, +1, +-3, +-2, +2, +3, +-2, +0, +1, +-1, +2, +-2, +1, +-1, +-4, +-3, +-4, +-3, +-3, +-3, +-4, +3, +1, +1, +3, +-4, +0, +-4, +2, +2, +3, +2, +-4, +1, +-1, +-2, +-2, +2, +-2, +2, +-3, +2, +0, 
+0, +0, +-4, +-2, +-2, +2, +1, +0, +2, +3, +2, +3, +-3, +0, +-3, +-2, +2, +-1, +2, +-3, +1, +0, +1, +3, +-4, +0, +-3, +-1, +1, +-2, +-2, +0, +0, +1, +-3, +-2, +0, +3, +2, +0, +3, +-4, +2, +1, +3, +-2, +-2, +-3, +2, +2, +2, +-1, +3, +2, +-3, +}; + +const char m_b[] = { +3, +-2, +-3, +-4, +-3, +2, +-4, +-1, +-4, +-4, +-1, +-3, +3, +0, +2, +1, +2, +-2, +0, +3, +-4, +-2, +2, +-1, +1, +-4, +-4, +-1, +1, +-3, +-2, +1, +-3, +1, +-2, +2, +1, +-3, +-1, +-4, +1, +-1, +-3, +3, +0, +0, +-2, +0, +-4, +-3, +0, +-3, +0, +3, +-1, +-4, +-2, +2, +-4, +-1, +-3, +1, +-4, +-4, +0, +2, +3, +2, +0, +-3, +-1, +-1, +2, +2, +-2, +-2, +-2, +-4, +3, +-3, +2, +1, +-1, +1, +-3, +-1, +2, +-1, +1, +-1, +0, +-4, +-2, +3, +0, +-4, +-2, +3, +-3, +1, +-1, +-1, +-2, +1, +-2, +-4, +0, +0, +0, +0, +-4, +-3, +3, +-2, +3, +3, +2, +3, +-4, +1, +-2, +-3, +-3, +2, +3, +-4, +-1, +3, +0, +3, +2, +1, +-1, +-1, +-4, +-1, +3, +2, +-1, +3, +0, +3, +0, +1, +-4, +0, +3, +2, +-4, +-2, +-2, +-4, +-3, +-2, +1, +0, +-2, +1, +3, +-2, +-1, +-3, +0, +3, +0, +0, +0, +2, +3, +-4, +-4, +2, +2, +3, +1, +0, +0, +-2, +1, +-2, +-4, +-3, +0, +-2, +2, +3, +2, +3, +3, +1, +1, +1, +3, +1, +3, +2, +-3, +1, +-1, +-2, +-2, +-2, +-1, +2, +2, +0, +-3, +-1, +2, +-4, +-2, +2, +-3, +-2, +-4, +3, +-1, +3, +2, +3, +-1, +-1, +-1, +-2, +-2, +2, +2, +-1, +-3, +-1, +3, +-3, +3, +3, +2, +-3, +2, +-3, +-3, +-2, +3, +-1, +0, +-2, +-1, +-1, +2, +-1, +-2, +1, +-4, +1, +3, +2, +1, +2, +-1, +-1, +-1, +-4, +0, +3, +1, +-2, +0, +2, +-2, +-2, +3, +-1, +-1, +0, +3, +3, +-1, +0, +0, +-4, +1, +-4, +-4, +0, +2, +3, +-3, +-2, +-3, +-3, +-2, +0, +-4, +-1, +0, +-2, +1, +-1, +-4, +1, +2, +0, +-2, +0, +2, +2, +2, +3, +-3, +1, +0, +-2, +2, +3, +-1, +-2, +1, +-3, +-1, +2, +2, +1, +3, +-2, +0, +-2, +-4, +1, +1, +1, +-2, +-1, +2, +0, +1, +-1, +-3, +-1, +1, +-1, +-1, +-3, +-4, +-2, +-2, +-1, +0, +3, +3, +0, +-2, +-2, +-2, +-3, +2, +2, +1, +3, +0, +3, +0, +-1, +-1, +3, +3, +-4, +1, +1, +-2, +-4, +-4, +3, +1, +0, +-1, +-4, +-2, +2, +0, +1, +-1, +0, +-3, +-2, +-1, +1, +-3, 
+-2, +2, +-4, +-3, +-3, +0, +0, +-3, +-3, +2, +0, +1, +-2, +-3, +-1, +3, +-1, +3, +-3, +-4, +-4, +0, +-4, +-1, +2, +1, +0, +0, +2, +2, +3, +2, +-1, +0, +-3, +-3, +3, +0, +-4, +0, +-2, +-2, +-1, +1, +3, +3, +1, +-3, +1, +-2, +-1, +2, +0, +0, +2, +-1, +-4, +-1, +-3, +0, +2, +-4, +-1, +0, +-2, +-1, +2, +-2, +-2, +3, +3, +0, +3, +1, +0, +2, +3, +2, +-2, +-4, +-1, +3, +3, +-1, +2, +0, +-2, +2, +-4, +-3, +-3, +-3, +-3, +-3, +0, +3, +3, +3, +0, +-3, +-1, +2, +3, +-3, +-3, +-4, +-1, +3, +0, +-4, +-2, +-3, +1, +3, +3, +-2, +2, +-4, +-2, +1, +-3, +-3, +-2, +3, +-3, +0, +-4, +0, +2, +-4, +0, +1, +-1, +3, +3, +3, +1, +3, +-4, +-4, +-1, +-3, +-3, +0, +1, +-1, +-3, +-4, +2, +3, +0, +-4, +-2, +-3, +0, +3, +2, +0, +-2, +-4, +-3, +-3, +-3, +-2, +2, +-4, +-4, +2, +3, +-3, +2, +-2, +-2, +-1, +-2, +-2, +-1, +3, +-4, +-4, +3, +-3, +-3, +-1, +-1, +2, +3, +-3, +-1, +-3, +-4, +}; + +const char m_exp[] = { +27, +-26, +-19, +13, +47, +17, +-5, +14, +88, +66, +2, +14, +6, +-39, +56, +35, +-19, +61, +-13, +4, +46, +1, +70, +27, +-22, +-28, +-13, +-5, +-4, +-6, +-10, +5, +26, +14, +3, +-1, +-3, +0, +10, +43, +4, +32, +-3, +6, +36, +26, +66, +20, +-20, +-10, +21, +-17, +-5, +36, +-9, +11, +10, +17, +-32, +12, +20, +9, +-1, +15, +-42, +-21, +-51, +24, +20, +26, +10, +42, +-4, +2, +-11, +41, +22, +4, +-17, +10, +28, +-10, +2, +10, +25, +32, +18, +43, +-3, +-9, +-9, +-4, +25, +-5, +9, +24, +54, +43, +14, +59, +18, +36, +-7, +30, +50, +-30, +21, +3, +27, +7, +21, +55, +-17, +-2, +9, +17, +31, +8, +12, +29, +43, +-17, +14, +-55, +13, +50, +24, +-13, +-33, +15, +-45, +9, +36, +-9, +-6, +14, +-1, +2, +-27, +-37, +-40, +-14, +2, +-10, +6, +27, +-8, +-13, +-17, +-4, +28, +51, +-5, +-14, +2, +-23, +17, +35, +41, +-3, +-37, +-12, +-49, +-31, +7, +15, +36, +-15, +-21, +6, +2, +36, +-12, +-27, +-34, +9, +44, +8, +57, +20, +-22, +-22, +-11, +-15, +-8, +-15, +-9, +49, +37, +26, +6, +30, +-17, +-43, +-3, +-31, +36, +49, +-15, +8, +51, +-8, +17, +-51, +-16, +-21, +12, +49, +-11, +41, +36, +15, +-5, +-69, +60, 
+14, +-24, +-21, +23, +3, +41, +41, +-27, +0, +33, +63, +18, +1, +-36, +-20, +18, +7, +-1, +37, +-33, +30, +36, +-29, +38, +57, +-8, +-21, +-26, +15, +-10, +-4, +-17, +-19, +8, +-9, +-13, +0, +10, +-30, +59, +12, +11, +51, +60, +19, +35, +6, +3, +-5, +-19, +-10, +20, +6, +-12, +-5, +5, +11, +33, +-33, +-27, +-2, +-2, +3, +53, +-1, +-15, +-10, +13, +-5, +24, +-17, +10, +8, +-3, +41, +-9, +9, +0, +30, +7, +7, +-10, +-28, +21, +13, +4, +-3, +44, +9, +-49, +-31, +-51, +23, +16, +-13, +-12, +36, +23, +34, +-39, +8, +20, +2, +32, +73, +24, +-16, +0, +-30, +17, +52, +-10, +28, +-43, +-1, +-34, +16, +-4, +39, +31, +5, +5, +3, +36, +48, +5, +28, +12, +57, +25, +-28, +14, +34, +49, +11, +41, +20, +-12, +-18, +-43, +-29, +58, +26, +44, +36, +-100, +-46, +-24, +0, +54, +4, +-2, +21, +5, +-12, +22, +9, +-35, +-5, +-36, +-22, +-10, +-11, +-15, +34, +20, +-33, +17, +39, +-13, +-32, +-36, +-22, +-3, +-38, +-39, +-6, +0, +15, +-6, +-15, +9, +-10, +45, +16, +17, +13, +-8, +15, +11, +-4, +59, +18, +-9, +12, +69, +27, +-30, +16, +7, +30, +41, +-30, +9, +-4, +23, +-5, +8, +3, +-75, +-25, +5, +14, +12, +-21, +-21, +6, +-13, +15, +8, +-6, +30, +24, +41, +-12, +5, +39, +18, +-4, +-60, +19, +-1, +-3, +-7, +16, +-11, +-51, +2, +-10, +-17, +-32, +39, +-35, +-46, +10, +27, +1, +21, +13, +78, +-12, +-2, +-28, +-20, +11, +75, +85, +-18, +21, +-7, +-9, +33, +19, +71, +23, +25, +2, +47, +32, +8, +24, +46, +-22, +23, +18, +-58, +66, +52, +22, +32, +25, +-34, +-26, +-21, +-69, +6, +27, +-28, +-13, +-15, +10, +-20, +-16, +20, +25, +7, +22, +52, +-4, +-8, +-28, +-18, +-6, +-36, +6, +-26, +12, +32, +8, +11, +16, +0, +-18, +-33, +-10, +11, +-40, +-6, +-29, +-34, +-30, +15, +15, +-27, +9, +8, +-43, +0, +-5, +4, +12, +60, +-11, +-4, +-32, +-25, +-38, +-37, +-17, +-1, +20, +37, +43, +7, +-2, +12, +-7, +-7, +-14, +-30, +9, +50, +-18, +-9, +-1, +-9, +6, +16, +-38, +-13, +30, +}; + +#define SIZE 24 +__attribute__ ((section(".heapsram"))) char g_mA[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) char 
g_mB[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) char g_mC[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) char g_mB_tmp[SIZE][SIZE]; diff --git a/astral/redmule b/astral/redmule new file mode 120000 index 0000000..bd0da93 --- /dev/null +++ b/astral/redmule @@ -0,0 +1 @@ +../hwpe/redmule/ \ No newline at end of file diff --git a/astral/redmule_256iter b/astral/redmule_256iter new file mode 120000 index 0000000..e81b225 --- /dev/null +++ b/astral/redmule_256iter @@ -0,0 +1 @@ +../hwpe/redmule_256iter/ \ No newline at end of file diff --git a/astral/redmule_softclear b/astral/redmule_softclear new file mode 120000 index 0000000..2904f52 --- /dev/null +++ b/astral/redmule_softclear @@ -0,0 +1 @@ +../hwpe/redmule_softclear/ \ No newline at end of file diff --git a/astral/softex b/astral/softex new file mode 120000 index 0000000..e1a6d4b --- /dev/null +++ b/astral/softex @@ -0,0 +1 @@ +../hwpe/softex/ \ No newline at end of file diff --git a/carfield.yaml b/carfield.yaml new file mode 100644 index 0000000..751550e --- /dev/null +++ b/carfield.yaml @@ -0,0 +1,19 @@ +carfield: + parMatrixMul8: + path: ./carfield/parMatrixMul8 + command: make clean all run + parMatrixMul16: + path: ./carfield/parMatrixMul16 + command: make clean all run + parMatrixMul32: + path: ./carfield/parMatrixMul32 + command: make clean all run + dmr_matmul: + path: ./carfield/dmr_matmul + command: make clean all run + redmule: + path: ./carfield/redmule + command: make clean all run + neureka: + path: ./carfield/neureka + command: make clean all run \ No newline at end of file diff --git a/carfield/dmr_matmul/Makefile b/carfield/dmr_matmul/Makefile new file mode 100644 index 0000000..66ed60e --- /dev/null +++ b/carfield/dmr_matmul/Makefile @@ -0,0 +1,12 @@ +PULP_APP = test +PULP_APP_SRCS = dmr_matmul.c + +PULP_CFLAGS = -O3 +PULP_LDFLAGS = -lm + +ifeq ($(fault_inject),1) + export FAULT_INJECTION=1 + export FAULT_INJECTION_SCRIPT=$(CURDIR)/pulp_inject_fault.tcl +endif + +include 
$(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/carfield/dmr_matmul/dmr_matmul.c b/carfield/dmr_matmul/dmr_matmul.c new file mode 100644 index 0000000..953dfdf --- /dev/null +++ b/carfield/dmr_matmul/dmr_matmul.c @@ -0,0 +1,100 @@ +/* +* @Author: Michael Rogenmoser +* @Date: 2023-02-17 18:00:21 +* @Last Modified by: Michael Rogenmoser +* @Last Modified time: 2023-02-17 18:15:33 +*/ +#include +#include +#include "matmul.h" + +#define N_ITERS 1 +#define max(x,y) (x > y ? x : y) +#define min(x,y) (x < y ? x : y) + +__attribute__ ((section(".heapsram"))) int A[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int B[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int C[SIZE][SIZE]; + +void initialize_mat(); + +void initialize_mat() { + int i,j; + + for (i=0;i +#include +#include +#include + +#define SIZE 0x400 +#define NUM_BANKS 16 +#define SCRUBBER_INTERVAL 2 + +int main() { + // Collecting info about the core ID and the running cluster ID + unsigned int core_id = get_core_id(); + unsigned int cluster_id = rt_cluster_id(); + + if (rt_cluster_id() != 0) return bench_cluster_forward(0); + + if (core_id != 0) synch_barrier(); + + unsigned int *test_array = pi_l1_malloc(cluster_id, SIZE); + + // Initializing the memory + for (int i = 0; i < SIZE; i++) { + pulp_write32(&test_array[i], i); + } + + // Initialize the scrubbing interval for all memory banks + for (int i = 0; i < NUM_BANKS; i++) + tcdm_scrubber_set_interval(cluster_id, i, SCRUBBER_INTERVAL); + + // Initialize the error-tracking variables + bool mismatch = 0; + unsigned int error = 0; + for (int i = 0; i < SIZE; i++) { + mismatch = (pulp_read32(&test_array[i]) != i); + if (mismatch) { + error ++; + printf("Expected 0x%x, got 0x%x\n", i, pulp_read32(&test_array[i])); + } + } + + unsigned int mismatch_cnt = 0; + unsigned int fix_cnt = 0; + unsigned int uncorrectable_cnt = 0; + for (int i = 0; i < 16; i++) { + mismatch_cnt += tcdm_scrubber_get_mismatch_count(cluster_id, i); + fix_cnt += 
tcdm_scrubber_get_fix_count(cluster_id, i); + uncorrectable_cnt += tcdm_scrubber_get_uncorrectable_count(cluster_id, i); + } + + printf("mismatch_cnt: %d, fix_cnt: %d, uncorrectable_cnt: %d\n", mismatch_cnt, fix_cnt, uncorrectable_cnt); + + return (error != 0) && (uncorrectable_cnt == 0); +} diff --git a/carfield/ecc_test/pulp_inject_fault.tcl b/carfield/ecc_test/pulp_inject_fault.tcl new file mode 100644 index 0000000..45cef47 --- /dev/null +++ b/carfield/ecc_test/pulp_inject_fault.tcl @@ -0,0 +1,53 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Author: Michael Rogenmoser (michaero@iis.ee.ethz.ch) + +transcript quietly +if {! [info exists ::env(VSIM_PATH)]} {error "Define VSIM_PATH"} +set utils_base_path [file join $::env(VSIM_PATH) scripts fault_injection_utils] +set script_base_path [file join $::env(VSIM_PATH) fault_injection_sim scripts] + +set verbosity 2 +set log_injections 1 +# Easy way to generate a variable seed +# set seed [clock seconds] +# Default value +set seed 12345 +set print_statistics 1 + +set inject_start_time 110584000000ps +set inject_stop_time 203880000000ps +set injection_clock "pulp_cluster_tb/cluster_i/clk_i" +set injection_clock_trigger 0 +set fault_period 100 +set rand_initial_injection_phase 0 +# max_num set to 0 means until stop_time +set max_num_fault_inject 0 +set signal_fault_duration 20ns +set register_fault_duration 0ns + +set allow_multi_bit_upset $::env(MULTI_BIT_UPSET) +set use_bitwidth_as_weight 0 +set check_core_output_modification 0 +set check_core_next_state_modification 0 +set reg_to_sig_ratio 1 + +source [file join $utils_base_path pulp_extract_nets.tcl] + +set inject_signals_netlist [] +set inject_register_netlist [] +set output_netlist [] +set next_state_netlist [] +set assertion_disable_list [] + +# for {set idx 0} {$idx < 12} {incr idx} { +# set inject_signals_netlist [list 
{*}$inject_signals_netlist {*}[get_all_core_nets $idx]] +# set output_netlist [list {*}$output_netlist {*}[get_core_output_nets $idx]] +# } + +set inject_register_netlist [list {*}$inject_register_netlist {*}[get_memory_slice {0 16} {385 449}]] + +source [file join $script_base_path inject_fault.tcl] + diff --git a/carfield/hello/Makefile b/carfield/hello/Makefile new file mode 100644 index 0000000..d145a0f --- /dev/null +++ b/carfield/hello/Makefile @@ -0,0 +1,5 @@ +PULP_APP = test +PULP_APP_SRCS = hello.c +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/carfield/hello/hello.c b/carfield/hello/hello.c new file mode 100644 index 0000000..8741206 --- /dev/null +++ b/carfield/hello/hello.c @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +int main() +{ + printf("Hello !\n"); + + return 0; +} diff --git a/carfield/neureka b/carfield/neureka new file mode 120000 index 0000000..98cb979 --- /dev/null +++ b/carfield/neureka @@ -0,0 +1 @@ +../hwpe/neureka/ \ No newline at end of file diff --git a/carfield/parMatrixMul16/Makefile b/carfield/parMatrixMul16/Makefile new file mode 100755 index 0000000..0fade4a --- /dev/null +++ b/carfield/parMatrixMul16/Makefile @@ -0,0 +1,8 @@ +PULP_APP = test +PULP_APP_SRCS = matrixMul.c + +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +#pulp-bench-reg --name=parMatrixMul16.cycles --module=pulp_rtl_testset --pipeline=$(PIPELINE) --artefact=pulp_rtl_testset --cmd="make run -f Makefile.sdk" --probe-regexp='matrixMul -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(16)" --probe-regexp='matrixMulTranspose -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(16),transposed" diff --git a/carfield/parMatrixMul16/gen_stimuli.py b/carfield/parMatrixMul16/gen_stimuli.py new file mode 100755 index 0000000..754a8a9 --- /dev/null +++ b/carfield/parMatrixMul16/gen_stimuli.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import sys +import random + + +def write_arr(f, name, arr): + f.write('const short %s[] = {\n' % name) + for v in arr: + f.write('%d,\n' % (v)) + f.write('};\n\n') + return + +################################################################################ +f = open('parMatrixMul16_stimuli.h', 'w') + + +SIZE = 24 +RANGE = int(2**7/SIZE) + +m_a = [] +m_b = [] +m_exp = [] + +for i in range(0,SIZE): + for j in range(0,SIZE): + a = random.randint(-RANGE, RANGE-1) + b = random.randint(-RANGE, RANGE-1) + + m_a.append(a) + m_b.append(b) + +for i in range(0,SIZE): + for j in range(0,SIZE): + r = 0 + + for k in range (0,SIZE): + r = r + m_a[i * SIZE + k] * m_b[k 
* SIZE + j] + + m_exp.append(r) + + +write_arr(f, 'm_a', m_a) +write_arr(f, 'm_b', m_b) +write_arr(f, 'm_exp', m_exp) + +f.write('#define SIZE %d\n' % SIZE) + + +f.write('__attribute__ ((section(".heapsram"))) short g_mA[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) short g_mB[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) short g_mC[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) short g_mB_tmp[SIZE][SIZE];\n') + diff --git a/carfield/parMatrixMul16/matrixMul.c b/carfield/parMatrixMul16/matrixMul.c new file mode 100644 index 0000000..5cbe65f --- /dev/null +++ b/carfield/parMatrixMul16/matrixMul.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ + +#include "pulp.h" + +#include "parMatrixMul16_stimuli.h" + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()); +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "matrixMul", .test = check_matrix_mul }, + { .name = "matrixMulTranspose", .test = check_matrix_mul_transpose }, + {0, 0} +}; + +unsigned int num_cores; + +int main() +{ + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + num_cores = get_core_num(); + + if(rt_core_id() < num_cores) { + run_suite(testcases); + } + + synch_barrier(); + + return 0; +} + +void matrix_init(); +unsigned int matrix_check(); + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB[k][j]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + // transpose 
array before using it + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mB_tmp[i][j] = g_mB[j][i]; + } + } + + if(num_cores != 1) synch_barrier(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB_tmp[j][k]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void matrix_init() { + unsigned int i, j; + + // init, copy to TCDM + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + g_mA[i][j] = m_a[i * SIZE + j]; + g_mB[i][j] = m_b[i * SIZE + j]; + g_mC[i][j] = 0; + } + } +} + +unsigned int matrix_check() { + unsigned int errors = 0; + unsigned int i, j; + // check + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + if(g_mC[i][j] != m_exp[i * SIZE + j]) { + printf("At index %d, %d\n", i, j, 0, 0); + errors++; + } + } + } + + return errors; +} diff --git a/carfield/parMatrixMul16/parMatrixMul16_stimuli.h b/carfield/parMatrixMul16/parMatrixMul16_stimuli.h new file mode 100644 index 0000000..2797565 --- /dev/null +++ b/carfield/parMatrixMul16/parMatrixMul16_stimuli.h @@ -0,0 +1,1742 @@ +const short m_a[] = { +3, +2, +-4, +0, +0, +0, +-5, +-3, +2, +4, +0, +2, +2, +-3, +2, +-1, +-2, +-1, +1, +4, +-3, +-3, +-1, +0, +4, +-2, +-1, +-4, +-1, +4, +1, +3, +-2, +-2, +-2, +-1, +3, +-5, +2, +0, +1, +-1, +-4, +4, +0, +-2, +4, +-5, +3, +-3, +-4, +3, +-2, +2, +-2, +3, +3, +4, +0, +-4, +4, +0, +3, +-5, +1, +0, +0, +2, +-3, +-2, +-3, +4, +-3, +1, +-5, +-5, +2, +-3, +-3, +-2, +-4, +0, +4, +1, +1, +0, +4, +0, +-3, +-3, +-1, +-2, +-5, +4, +-2, +1, +3, +-1, +-1, +-4, +4, +-1, +3, +-3, +2, +2, +3, +1, +2, +1, +2, +4, +-4, +-3, +-2, +-3, +4, +0, +1, +0, +-3, +0, +-2, +-3, +1, +3, +4, +-1, +4, +4, +3, +-4, +-5, +3, +2, +4, +-4, +4, +1, +-2, +2, +-1, +4, +2, +2, +-5, +0, +-2, +-4, +2, +-3, +-1, +4, +0, +4, +4, +1, +4, +4, +-3, +3, +4, +-2, +4, +-4, +2, +-2, +-3, +3, +1, +3, +3, +0, +-4, +1, 
+-4, +4, +2, +-2, +-5, +-4, +-4, +2, +-1, +-2, +0, +4, +-3, +4, +-5, +3, +-1, +-5, +-4, +-2, +-5, +4, +-2, +3, +-3, +4, +3, +-4, +4, +3, +-3, +4, +-1, +3, +-4, +-5, +1, +-3, +-4, +2, +3, +2, +-2, +4, +3, +2, +-4, +0, +-2, +-5, +-3, +3, +1, +3, +-5, +3, +1, +-5, +1, +-2, +-4, +1, +-5, +4, +1, +0, +0, +-4, +-2, +2, +0, +-3, +-4, +-1, +-5, +-2, +-4, +-2, +3, +4, +-5, +0, +2, +2, +4, +0, +4, +-5, +-3, +-5, +-5, +0, +0, +-4, +3, +1, +1, +-5, +2, +-5, +2, +-4, +3, +4, +-3, +-4, +4, +-2, +-4, +4, +-3, +-5, +-4, +2, +-2, +3, +-2, +1, +4, +1, +4, +-2, +1, +-4, +4, +2, +-1, +1, +1, +-2, +-2, +0, +-5, +2, +4, +-4, +1, +-5, +0, +4, +3, +-4, +1, +-3, +-4, +2, +-3, +-3, +1, +2, +4, +-3, +-2, +3, +-3, +-5, +4, +-5, +-5, +4, +4, +3, +-1, +-3, +-4, +-1, +1, +0, +1, +4, +-1, +0, +1, +-4, +0, +-4, +-3, +0, +-4, +2, +-2, +-2, +-1, +-2, +0, +-5, +-1, +-3, +3, +4, +-4, +4, +4, +0, +-4, +-3, +-3, +4, +0, +2, +3, +-1, +2, +0, +-4, +3, +-1, +-3, +-4, +4, +0, +4, +3, +0, +-3, +-5, +4, +-3, +3, +4, +3, +-1, +3, +-3, +1, +3, +4, +3, +-1, +3, +2, +4, +4, +-2, +-1, +2, +-4, +-4, +-1, +0, +-1, +4, +-4, +4, +2, +0, +0, +1, +-1, +-5, +1, +-5, +2, +-2, +2, +1, +3, +1, +-3, +-4, +-2, +0, +4, +-2, +0, +-1, +-1, +-3, +-2, +4, +3, +4, +2, +3, +4, +-4, +-2, +-1, +-2, +0, +-1, +-1, +-4, +-2, +2, +-5, +4, +-5, +0, +4, +4, +-2, +-4, +1, +-5, +-5, +4, +-3, +0, +-1, +-5, +4, +-4, +0, +4, +4, +-4, +-2, +-4, +1, +-4, +-5, +4, +-5, +-3, +2, +-2, +-1, +-3, +0, +-3, +-2, +4, +-5, +-5, +3, +-5, +-1, +3, +2, +3, +0, +-2, +-3, +-4, +3, +0, +3, +3, +-1, +-4, +0, +3, +-5, +-3, +4, +-3, +2, +-1, +0, +2, +-1, +3, +-1, +-5, +3, +4, +3, +3, +4, +3, +1, +-4, +1, +-2, +4, +-3, +2, +0, +-2, +-5, +-5, +-2, +-2, +0, +4, +0, +-4, +-1, +0, +-5, +-3, +4, +0, +1, +-3, +3, +-2, +-2, +2, +-1, +-1, +-3, +-1, +-2, +-5, +-5, +-1, +-1, +-4, +-4, +}; + +const short m_b[] = { +-2, +3, +2, +-5, +-3, +2, +0, +-5, +1, +-3, +-2, +-3, +-3, +3, +-4, +-2, +-4, +1, +1, +1, +4, +4, +3, +3, +-4, +-3, +-2, +-2, +-3, +-2, +-1, +-5, +-4, +1, +-4, +-3, 
+3, +-5, +-5, +-4, +-2, +-2, +1, +3, +3, +3, +4, +0, +1, +0, +4, +3, +-3, +3, +-1, +0, +2, +-1, +-1, +0, +0, +3, +-4, +4, +3, +3, +-1, +-4, +-1, +-4, +-2, +-4, +-4, +3, +0, +3, +-4, +2, +0, +0, +-4, +-5, +-4, +4, +0, +-4, +-2, +-4, +3, +4, +-4, +-3, +-2, +0, +-2, +2, +-5, +2, +1, +2, +0, +0, +-2, +-2, +-1, +1, +-1, +3, +3, +-1, +3, +-1, +-2, +1, +-3, +3, +3, +3, +2, +0, +1, +-2, +1, +4, +3, +-2, +-5, +0, +2, +0, +1, +-3, +-1, +3, +-3, +-2, +3, +4, +-2, +-1, +-5, +1, +-1, +2, +-3, +-4, +0, +-1, +0, +2, +3, +4, +-4, +-1, +-1, +2, +-1, +-5, +1, +-3, +0, +-4, +0, +-1, +2, +-5, +-1, +1, +-2, +-5, +-3, +-3, +2, +2, +-4, +2, +-5, +-4, +4, +4, +-1, +1, +2, +-1, +-3, +0, +2, +4, +-3, +-2, +-1, +4, +-5, +4, +0, +-1, +-1, +-4, +-3, +-3, +-1, +-2, +-1, +-2, +-5, +2, +2, +-3, +1, +-1, +-2, +0, +0, +1, +0, +4, +-4, +-4, +-4, +-5, +4, +-4, +4, +-3, +4, +-5, +-2, +3, +-2, +-4, +1, +2, +-5, +-2, +-5, +-3, +2, +-1, +3, +-4, +-4, +4, +-3, +-5, +0, +-3, +2, +2, +3, +-3, +-3, +-2, +-5, +4, +1, +3, +-1, +-4, +3, +1, +-2, +1, +3, +4, +1, +-2, +0, +0, +-3, +-1, +1, +-1, +0, +-5, +-2, +-1, +1, +-2, +4, +-3, +2, +1, +0, +0, +3, +-4, +2, +1, +0, +3, +-5, +-1, +4, +3, +4, +-2, +-1, +-3, +-4, +3, +3, +-4, +4, +-5, +1, +-1, +0, +3, +4, +3, +-3, +-5, +-3, +-2, +-4, +1, +-2, +-3, +-2, +1, +2, +4, +4, +0, +2, +4, +0, +-1, +1, +-4, +-1, +-3, +-2, +-5, +3, +-3, +-1, +0, +-3, +-3, +-4, +3, +3, +-5, +-2, +-1, +3, +1, +1, +1, +3, +1, +1, +-5, +-2, +2, +4, +3, +-3, +-5, +3, +-4, +0, +2, +-4, +-5, +-3, +-3, +-3, +2, +-1, +-3, +1, +-3, +-1, +-5, +2, +1, +1, +-3, +2, +4, +1, +-5, +3, +0, +-5, +-4, +2, +-5, +3, +-1, +0, +2, +-3, +-1, +-2, +-5, +-4, +0, +-5, +-3, +3, +1, +0, +-1, +-4, +1, +-4, +-5, +-1, +3, +3, +-3, +-1, +4, +-1, +-5, +-4, +-1, +0, +1, +1, +4, +-3, +-2, +4, +1, +-4, +-1, +-3, +2, +-2, +4, +2, +-1, +2, +-4, +-3, +4, +-1, +1, +-4, +0, +1, +-4, +4, +-4, +-5, +-5, +-3, +2, +2, +-1, +0, +-2, +-5, +1, +-1, +-5, +2, +-4, +-5, +-3, +1, +-4, +0, +1, +-5, +0, +-2, +1, +-4, +-5, +3, +0, +1, +2, +4, 
+-1, +-4, +0, +-4, +0, +-5, +3, +-5, +-5, +2, +2, +3, +-3, +4, +4, +2, +3, +-2, +4, +3, +4, +1, +1, +0, +2, +2, +4, +-1, +-5, +-5, +3, +-5, +-2, +-4, +-4, +0, +4, +3, +0, +4, +-3, +4, +0, +-1, +3, +2, +-4, +2, +-1, +-3, +-5, +-3, +3, +-4, +1, +-3, +3, +-4, +0, +3, +-2, +-1, +2, +-4, +2, +4, +0, +2, +-2, +-5, +0, +-1, +-5, +4, +-4, +3, +-1, +4, +1, +4, +0, +-1, +1, +4, +3, +3, +1, +2, +-3, +-2, +-5, +-2, +1, +-5, +-4, +-5, +-2, +}; + +const short m_exp[] = { +-32, +48, +2, +-56, +46, +-52, +39, +-77, +41, +-50, +-50, +-14, +-45, +-31, +3, +-22, +-52, +-31, +-32, +8, +61, +97, +49, +-12, +12, +-25, +-26, +-15, +15, +45, +-29, +-27, +-7, +-23, +30, +-36, +-56, +27, +-59, +35, +6, +44, +72, +23, +17, +86, +-8, +5, +-25, +41, +-30, +-78, +120, +-1, +69, +-20, +24, +-66, +6, +68, +-59, +2, +27, +-21, +-71, +-61, +-49, +21, +-24, +27, +-47, +29, +71, +37, +-9, +-58, +13, +13, +73, +-9, +7, +28, +23, +35, +46, +-8, +86, +-7, +25, +-65, +-18, +43, +58, +22, +44, +-51, +-33, +10, +-18, +-17, +-62, +24, +18, +-68, +23, +25, +23, +-32, +23, +37, +55, +-16, +24, +4, +-16, +9, +77, +44, +48, +-36, +-81, +-100, +-18, +36, +-3, +-72, +-43, +38, +-2, +34, +53, +-42, +-18, +56, +19, +34, +3, +-58, +-16, +-34, +-71, +-12, +12, +-22, +-9, +57, +-46, +-70, +25, +-62, +10, +-5, +68, +-86, +-15, +-19, +-100, +78, +23, +47, +5, +-64, +31, +-16, +-9, +-22, +11, +30, +-74, +31, +50, +38, +-59, +24, +12, +-18, +4, +33, +-24, +-44, +-19, +2, +-47, +48, +-30, +28, +-38, +-29, +20, +52, +-21, +-5, +49, +-41, +-17, +-2, +68, +-2, +122, +-29, +13, +23, +46, +40, +-12, +-58, +59, +2, +15, +-49, +-38, +-15, +46, +-1, +-54, +-73, +10, +-1, +43, +80, +-55, +119, +82, +22, +-12, +-14, +-12, +-3, +36, +27, +-3, +50, +59, +61, +13, +-14, +5, +51, +-54, +-45, +-9, +109, +10, +-28, +1, +0, +-50, +-23, +27, +49, +16, +3, +23, +43, +31, +29, +-27, +-45, +9, +61, +42, +34, +30, +26, +38, +-84, +5, +98, +13, +55, +15, +56, +63, +22, +80, +10, +52, +52, +49, +101, +37, +71, +-20, +-38, +-36, +-72, +-40, +12, 
+77, +-19, +32, +9, +-15, +85, +-13, +-33, +-8, +-20, +33, +36, +76, +6, +67, +-51, +36, +76, +-48, +30, +42, +-45, +-22, +-9, +65, +-29, +18, +89, +34, +-25, +41, +30, +26, +49, +34, +-6, +-28, +10, +-106, +6, +86, +5, +-3, +-105, +-103, +-45, +-55, +-60, +29, +41, +54, +-29, +36, +-30, +-8, +25, +36, +40, +22, +-35, +-50, +33, +3, +-19, +-30, +3, +-11, +9, +-4, +-18, +39, +92, +-25, +0, +-40, +64, +-18, +58, +-2, +27, +-89, +32, +-2, +56, +79, +-67, +87, +-41, +20, +29, +4, +51, +45, +26, +-54, +8, +-37, +-37, +-39, +-35, +-36, +-23, +-30, +11, +1, +-63, +-21, +5, +13, +35, +-15, +27, +-59, +-10, +29, +4, +-58, +-56, +3, +-10, +27, +0, +29, +-4, +-17, +28, +2, +17, +9, +-27, +53, +-19, +-36, +104, +25, +38, +84, +-29, +42, +-10, +-47, +-36, +1, +10, +43, +-46, +-57, +38, +44, +-9, +26, +-15, +10, +31, +34, +-18, +17, +24, +-3, +5, +11, +4, +23, +5, +-19, +51, +-59, +-29, +62, +-27, +35, +81, +65, +-10, +-1, +-1, +45, +20, +26, +-34, +-7, +35, +-7, +2, +19, +115, +-32, +-40, +-92, +47, +-65, +23, +53, +-31, +22, +20, +45, +91, +64, +29, +49, +30, +74, +-15, +22, +120, +16, +122, +-51, +51, +78, +60, +-53, +40, +-49, +73, +-2, +14, +-17, +-86, +35, +-25, +36, +2, +5, +38, +-9, +16, +-12, +-41, +-24, +-31, +-72, +-68, +-9, +5, +11, +-63, +15, +12, +21, +22, +19, +62, +-28, +122, +12, +28, +-59, +-19, +49, +57, +-12, +29, +-1, +-23, +126, +75, +-29, +-21, +-37, +8, +-70, +-8, +91, +-34, +-10, +31, +78, +44, +59, +23, +66, +42, +38, +6, +19, +10, +1, +57, +38, +71, +-10, +-12, +-7, +-13, +-15, +6, +}; + +#define SIZE 24 +__attribute__ ((section(".heapsram"))) short g_mA[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) short g_mB[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) short g_mC[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) short g_mB_tmp[SIZE][SIZE]; diff --git a/carfield/parMatrixMul32/Makefile b/carfield/parMatrixMul32/Makefile new file mode 100755 index 0000000..bd55e15 --- /dev/null +++ b/carfield/parMatrixMul32/Makefile @@ -0,0 +1,8 @@ 
+PULP_APP = test +PULP_APP_SRCS = matrixMul.c + +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +#pulp-bench-reg --name=parMatrixMul32.cycles --module=pulp_rtl_testset --pipeline=$(PIPELINE) --artefact=pulp_rtl_testset --cmd="make run -f Makefile.sdk" --probe-regexp='matrixMul -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(32)" --probe-regexp='matrixMulTranspose -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(32),transposed" diff --git a/carfield/parMatrixMul32/gen_stimuli.py b/carfield/parMatrixMul32/gen_stimuli.py new file mode 100755 index 0000000..32926e9 --- /dev/null +++ b/carfield/parMatrixMul32/gen_stimuli.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import sys +import random + + +def write_arr(f, name, arr): + f.write('const int %s[] = {\n' % name) + for v in arr: + f.write('%d,\n' % (v)) + f.write('};\n\n') + return + +################################################################################ +f = open('parMatrixMul32_stimuli.h', 'w') + + +SIZE = 24 +RANGE = int(2**15/SIZE) + +m_a = [] +m_b = [] +m_exp = [] + +for i in range(0,SIZE): + for j in range(0,SIZE): + a = random.randint(-RANGE, RANGE-1) + b = random.randint(-RANGE, RANGE-1) + + m_a.append(a) + m_b.append(b) + +for i in range(0,SIZE): + for j in range(0,SIZE): + r = 0 + + for k in range (0,SIZE): + r = r + m_a[i * SIZE + k] * m_b[k * SIZE + j] + + m_exp.append(r) + + +write_arr(f, 'm_a', m_a) +write_arr(f, 'm_b', m_b) +write_arr(f, 'm_exp', m_exp) + +f.write('#define SIZE %d\n' % SIZE) + + +f.write('__attribute__ ((section(".heapsram"))) int g_mA[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) int g_mB[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) int g_mC[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) int 
g_mB_tmp[SIZE][SIZE];\n') + diff --git a/carfield/parMatrixMul32/matrixMul.c b/carfield/parMatrixMul32/matrixMul.c new file mode 100644 index 0000000..990d411 --- /dev/null +++ b/carfield/parMatrixMul32/matrixMul.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ + +#include "pulp.h" + +#include "parMatrixMul32_stimuli.h" + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()); +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "matrixMul", .test = check_matrix_mul }, + { .name = "matrixMulTranspose", .test = check_matrix_mul_transpose }, + {0, 0} +}; + +unsigned int num_cores; + +int main() +{ + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + num_cores = get_core_num(); + + if(rt_core_id() < num_cores) { + run_suite(testcases); + } + + synch_barrier(); + + return 0; +} + +void matrix_init(); +unsigned int matrix_check(); + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + 
matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB[k][j]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + // transpose array before using it + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mB_tmp[i][j] = g_mB[j][i]; + } + } + + if(num_cores != 1) synch_barrier(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB_tmp[j][k]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void matrix_init() { + unsigned int i, j; + + // init, copy to TCDM + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + g_mA[i][j] = m_a[i * SIZE + j]; + g_mB[i][j] = m_b[i * SIZE + j]; + g_mC[i][j] = 0; + } + } +} + +unsigned int matrix_check() { + unsigned int errors = 0; + unsigned int i, j; + // check + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + if(g_mC[i][j] != m_exp[i * SIZE + j]) { + printf("At index %d, %d\n", i, j, 0, 0); + errors++; + } + } + } + + return errors; +} diff --git a/carfield/parMatrixMul32/parMatrixMul32_stimuli.h b/carfield/parMatrixMul32/parMatrixMul32_stimuli.h new file mode 100644 index 0000000..d5c1af0 --- /dev/null +++ 
b/carfield/parMatrixMul32/parMatrixMul32_stimuli.h @@ -0,0 +1,1742 @@ +const int m_a[] = { +-1344, +-778, +-350, +1240, +950, +940, +1262, +285, +738, +-37, +257, +704, +87, +-1135, +1175, +960, +984, +611, +-1170, +1243, +-729, +-1235, +-1229, +8, +52, +1156, +-804, +-129, +-112, +-307, +1045, +-954, +-944, +477, +1104, +-600, +-31, +1364, +-950, +-153, +967, +-446, +235, +-197, +763, +-660, +-1289, +116, +-458, +994, +1086, +156, +-549, +102, +-532, +943, +299, +340, +516, +1117, +515, +1335, +708, +-306, +-589, +517, +655, +-279, +-595, +-255, +-1104, +-16, +507, +199, +-297, +-905, +-1084, +982, +20, +-1364, +407, +333, +-798, +-711, +1301, +175, +490, +-165, +-1097, +1251, +1203, +-884, +419, +-1262, +-950, +-1200, +759, +-205, +-1365, +-870, +109, +-1135, +36, +796, +-1233, +-1117, +-826, +241, +588, +-513, +-353, +791, +1071, +-1073, +220, +-1094, +1096, +-723, +280, +-505, +151, +399, +319, +-1120, +-213, +-966, +679, +497, +-290, +-300, +-290, +-599, +243, +-752, +604, +1196, +-715, +-177, +-329, +1337, +596, +1027, +509, +-301, +-1211, +-936, +-281, +446, +-356, +841, +-1123, +-1343, +-140, +-1300, +-828, +-237, +1206, +1274, +-1092, +-922, +913, +1201, +-422, +782, +-230, +633, +-1055, +-1160, +329, +1255, +1038, +770, +123, +934, +670, +-492, +-24, +-779, +-1129, +47, +555, +1214, +-232, +-716, +-322, +-126, +178, +827, +710, +-1057, +-313, +996, +1149, +-532, +570, +1171, +899, +-313, +-790, +1071, +154, +-303, +492, +-918, +-1139, +292, +129, +1347, +-309, +751, +1262, +142, +-1062, +-1305, +250, +657, +238, +-141, +1308, +-37, +-514, +-591, +-611, +852, +-653, +-640, +91, +254, +-1145, +-1263, +-838, +-10, +266, +-444, +1129, +762, +-713, +-326, +-88, +1063, +-442, +-177, +365, +-740, +-1219, +1085, +783, +-725, +-1112, +426, +660, +6, +-440, +513, +687, +1078, +212, +-434, +-953, +1337, +160, +622, +-950, +-943, +288, +-136, +-1103, +-223, +1271, +211, +251, +-271, +-26, +704, +1177, +544, +699, +-885, +-864, +-1280, +877, +-461, +995, +-623, +-121, 
+-146, +-484, +-225, +-978, +163, +-278, +-502, +-505, +-567, +-771, +1279, +699, +-1337, +544, +1145, +1271, +640, +277, +-164, +458, +-1280, +-602, +-2, +1136, +1203, +-699, +-195, +659, +-472, +1230, +1151, +-97, +-77, +-772, +-381, +-295, +636, +-1341, +-445, +-806, +531, +-1186, +-1313, +-274, +835, +-446, +558, +-1307, +-235, +43, +-254, +-109, +911, +-1189, +559, +-854, +-218, +149, +580, +1158, +-14, +181, +-1120, +-947, +-542, +1142, +631, +-893, +-614, +-257, +-365, +-951, +1, +-762, +268, +382, +-131, +808, +-234, +839, +346, +-733, +1251, +496, +-566, +-751, +581, +-1292, +1068, +-932, +-855, +1336, +-280, +523, +1294, +-1251, +1284, +-1276, +87, +1264, +-274, +-922, +-289, +-458, +-117, +196, +-79, +-707, +1233, +-385, +-620, +-617, +703, +-995, +-374, +660, +145, +821, +1289, +582, +-201, +447, +116, +759, +-615, +834, +268, +-1114, +-1016, +-227, +-589, +-910, +-244, +-660, +764, +219, +1165, +506, +-673, +799, +-1355, +-872, +491, +689, +176, +-285, +1151, +1080, +-319, +286, +833, +217, +-621, +478, +539, +-109, +-1273, +-564, +-240, +504, +518, +256, +-124, +74, +949, +-912, +-1341, +965, +-774, +634, +1009, +1304, +200, +-1041, +-1262, +-865, +-1065, +-635, +-357, +-928, +806, +1148, +-411, +56, +686, +-644, +1241, +-430, +297, +127, +457, +-1313, +741, +861, +220, +-540, +772, +265, +1066, +679, +177, +-734, +29, +-149, +181, +-1042, +-1139, +271, +-326, +-29, +1298, +643, +-890, +-136, +-1015, +-565, +-964, +894, +-312, +698, +159, +-222, +-1322, +578, +945, +1124, +1278, +54, +-389, +1101, +362, +-543, +380, +959, +-399, +-1105, +1308, +338, +-198, +-1111, +-278, +-752, +668, +1156, +-1226, +579, +184, +-1084, +-917, +-498, +-466, +316, +-788, +-718, +468, +367, +-1333, +-1146, +828, +1329, +311, +-1346, +54, +-976, +854, +-658, +-198, +-979, +156, +385, +-659, +1326, +1351, +-1173, +-648, +720, +-40, +313, +729, +-416, +351, +452, +-413, +-4, +-1113, +-612, +-28, +-721, +400, +1072, +-1010, +}; + +const int m_b[] = { +-1316, +319, +963, 
+-608, +519, +-783, +-676, +181, +172, +203, +-1351, +-935, +-12, +758, +-746, +1226, +127, +-1346, +1251, +-377, +889, +-23, +-417, +-122, +680, +1363, +729, +-907, +-1263, +-431, +363, +1355, +-566, +-517, +-1186, +1318, +-1104, +-1245, +950, +687, +252, +-270, +1081, +-1290, +656, +8, +60, +1171, +915, +-500, +678, +-953, +307, +-35, +-1334, +-888, +598, +1160, +722, +850, +-268, +988, +635, +-340, +252, +1208, +420, +82, +1283, +-319, +-666, +172, +583, +174, +471, +-1063, +452, +-191, +-1188, +116, +-927, +1086, +119, +-245, +-717, +-657, +417, +319, +1133, +1338, +141, +-546, +567, +-1089, +-191, +-1138, +-201, +-1286, +-820, +-1356, +1177, +-317, +191, +67, +164, +-306, +-1015, +1147, +-482, +1229, +-259, +-207, +1309, +847, +-399, +-1005, +-995, +140, +-567, +-1220, +-427, +180, +-571, +997, +-783, +-316, +-1360, +736, +75, +-1251, +-307, +-902, +1181, +1057, +-141, +-1098, +776, +1096, +-923, +914, +1049, +-28, +-742, +-804, +-467, +567, +329, +-309, +-161, +-157, +-430, +-639, +1138, +-165, +292, +-20, +777, +-715, +60, +-1359, +35, +307, +-1092, +271, +548, +822, +-50, +-475, +-103, +784, +537, +152, +-517, +-1097, +117, +-619, +538, +941, +172, +-223, +1161, +-1004, +-1145, +-455, +255, +363, +859, +403, +-861, +-657, +-537, +-1084, +-1042, +541, +1283, +-356, +1298, +-1254, +-303, +203, +104, +1123, +-72, +-171, +-1122, +-533, +440, +275, +613, +-846, +-189, +884, +704, +-570, +-440, +-1157, +-200, +-80, +616, +799, +757, +-264, +-1256, +-690, +152, +184, +-810, +-221, +-821, +-243, +508, +-709, +574, +-693, +315, +-952, +952, +697, +875, +-480, +-691, +422, +-413, +-1199, +441, +-751, +821, +1303, +-410, +416, +566, +-131, +-551, +46, +978, +-228, +1117, +-251, +-537, +874, +-882, +260, +-213, +248, +-1296, +1343, +-626, +-812, +629, +-601, +-378, +-1314, +-889, +774, +-307, +692, +-1125, +-692, +923, +947, +1158, +-939, +1284, +35, +1299, +369, +-8, +43, +768, +524, +137, +659, +285, +-1315, +-457, +871, +-768, +1107, +-695, +488, +-527, +-161, +414, 
+-526, +-1164, +1059, +-1108, +560, +-622, +898, +-50, +-286, +-170, +513, +952, +433, +237, +584, +-665, +-960, +585, +-434, +1223, +-130, +1035, +430, +202, +1312, +1152, +1059, +-1082, +-1295, +805, +-18, +613, +-94, +557, +548, +1354, +116, +289, +-1358, +-1234, +1237, +451, +820, +-102, +974, +832, +-1019, +914, +-512, +-267, +1329, +-910, +-1341, +862, +-381, +-23, +-658, +40, +-71, +-782, +1240, +-956, +1241, +-291, +-884, +1250, +699, +834, +190, +960, +-1260, +177, +464, +155, +-1105, +768, +424, +621, +740, +-1357, +1186, +-594, +1329, +829, +126, +1101, +1146, +-95, +605, +-673, +1334, +440, +-10, +12, +-745, +20, +19, +-793, +999, +1083, +487, +-657, +-356, +654, +-326, +-250, +-718, +-947, +-235, +558, +974, +-981, +637, +-861, +-768, +1045, +-583, +-910, +128, +734, +896, +-1156, +223, +284, +272, +634, +-473, +363, +359, +-1185, +14, +-33, +-1122, +140, +900, +439, +-944, +-770, +663, +865, +1056, +-238, +86, +-1294, +-44, +-603, +602, +20, +397, +-423, +-703, +-209, +-906, +-1236, +945, +-737, +578, +904, +645, +1225, +-877, +-425, +-493, +-1326, +424, +965, +1300, +-1210, +823, +1345, +626, +-427, +592, +-869, +-1055, +-938, +-427, +1066, +472, +-1055, +48, +-1200, +-349, +313, +-1227, +-228, +783, +839, +187, +1021, +-1355, +1284, +68, +-1321, +-997, +1286, +-887, +772, +-156, +-105, +1329, +1141, +-377, +-881, +-341, +1316, +-391, +-1249, +-205, +53, +-266, +-540, +-289, +-1011, +602, +-1032, +-1097, +-202, +-467, +-1047, +-867, +-340, +-109, +-496, +967, +1147, +108, +384, +-12, +1216, +137, +1318, +151, +219, +-543, +391, +668, +-1348, +-1244, +-810, +-676, +321, +-1258, +1343, +1214, +791, +35, +1219, +1278, +1037, +-1282, +661, +585, +921, +-880, +-989, +-1192, +-207, +273, +-382, +690, +165, +271, +-212, +739, +-343, +-42, +226, +40, +859, +-153, +622, +-1059, +}; + +const int m_exp[] = { +-4942391, +2289133, +-1363225, +1978230, +1580032, +-625813, +-3230128, +2236653, +19494, +3242695, +-1080745, +-34154, +4086860, +-1370876, +3997221, 
+-1812380, +4705498, +7690207, +-4068140, +3595067, +-1103308, +-939857, +-4249710, +-8650816, +-2013119, +2933624, +235162, +-453807, +-4447391, +3527041, +2046492, +411956, +-994117, +-1411344, +1333704, +-519761, +3026373, +-564969, +3749147, +2447173, +-557628, +1138674, +-1426096, +-4033488, +-1829685, +2815607, +2382958, +1714081, +-1470484, +3379876, +3660759, +-2439960, +-1180478, +-3300785, +-5104533, +-309753, +-1667400, +3258850, +1805449, +2481948, +-944985, +-363123, +4227063, +3022289, +2763211, +5114077, +-1534394, +-2957168, +3401637, +-1195822, +-747480, +-2915318, +-2505013, +-174927, +276733, +2899369, +-6702856, +923396, +-2741169, +4270685, +-1020657, +-2562887, +2074098, +-2382784, +1366504, +691209, +4127820, +400356, +-7415505, +823772, +-3848400, +-158560, +3759990, +2298445, +323394, +873625, +474364, +2617120, +-1382444, +1735284, +-5799715, +1915577, +7016057, +-1212904, +919286, +2949768, +1228832, +572192, +4145710, +-6809520, +-2199597, +677764, +-7169579, +-4904277, +6902014, +231123, +-4797299, +3093608, +989455, +4324476, +3121268, +810907, +-2457323, +2299211, +-1625774, +-141013, +3343022, +-2044657, +4089375, +-291323, +-1950307, +2480885, +2846731, +-2139146, +-2718414, +-1997531, +-2399245, +-4060224, +423228, +-205276, +-1602384, +910872, +-3535114, +6008729, +-559984, +-301205, +-5407307, +2981269, +1079061, +6602535, +-857708, +2756391, +-5304566, +-3769267, +2620777, +-4409088, +502077, +7568647, +-37918, +2315061, +-2540065, +8296540, +-7465282, +1553910, +-4736227, +-2139045, +95614, +342546, +1309722, +1777391, +2981296, +-736899, +-2572111, +-972463, +-2793724, +-2893912, +-1230264, +-871649, +-1439985, +3074445, +3339004, +2423842, +1751086, +4064832, +1550243, +6225792, +-503750, +-2567772, +5671219, +-2054796, +-551487, +-2787790, +-3835027, +-4272806, +2725813, +-2982521, +1803437, +3024675, +-201092, +-1626608, +1548043, +2303810, +3032912, +398283, +1704371, +1860306, +-4101665, +5187913, +4233418, +2054883, 
+3603470, +1935132, +-17548, +-4362444, +-2806918, +-5651039, +-1853372, +-1707208, +-153048, +-2791834, +402265, +2815962, +3391662, +-4833520, +-1190520, +2302448, +80738, +2089586, +174096, +2837490, +-5514606, +2138871, +336249, +-378675, +5833977, +4367000, +-2445147, +-3652299, +-1794451, +-1471577, +-1263012, +3719274, +-3404819, +5765304, +-4256415, +3558206, +-1884441, +-475244, +3659623, +-2914867, +689238, +-2576754, +7739914, +1823902, +2077002, +-2365242, +2023481, +1663749, +-4973435, +-694558, +1118078, +2260786, +-3256285, +4596746, +-5421599, +594942, +-1730692, +-4626077, +3077882, +-2232009, +2672161, +3135747, +-4602601, +859784, +-3530668, +21600, +-4690786, +2023164, +-496745, +-2728919, +281474, +-108745, +-809613, +1445687, +-5781458, +2097169, +1594266, +-4504019, +2460482, +6259537, +-700848, +413263, +-1212884, +-5695130, +2094147, +-750529, +-1379008, +6029072, +722889, +1719449, +1682336, +-4219755, +1971162, +66575, +-1195119, +141466, +-1083536, +-718558, +-4039954, +-168429, +-2026861, +2025800, +-761083, +-4194692, +2013337, +-1156936, +3823019, +4081732, +-3145845, +-1733615, +-1371947, +-3811245, +-1584663, +-3547009, +-3267886, +8255291, +-3232160, +3404636, +3248369, +3233853, +671601, +-1009897, +1821121, +-3517645, +2005444, +-2768741, +115998, +533867, +4717709, +1315923, +-3510545, +-3539595, +-538461, +4529529, +2792584, +-107486, +-1840413, +-1474849, +1579605, +-4197602, +-34825, +-462678, +1294881, +-1730927, +-2549709, +-1531672, +-271859, +-1181904, +-1680154, +-2321723, +-6641222, +1127764, +893535, +-2804646, +5653509, +2657606, +-1751466, +-4669812, +-827592, +-126901, +-2599752, +-845148, +1390838, +8975481, +-7663778, +3572438, +5920790, +5233883, +-613590, +-881500, +-3974422, +-5523348, +-3243204, +-6405765, +-4376438, +1352634, +-105650, +2650174, +1442151, +5088231, +2974595, +-4501663, +-841006, +-3101819, +-1265401, +-2756903, +2579743, +2045040, +-5328835, +2801176, +-386694, +-3068782, +3147225, +-248211, 
+-662659, +-1112717, +2733193, +336344, +3107302, +2244003, +4285762, +1998904, +1888720, +-1174981, +-2567532, +-5588952, +-101948, +4004848, +-610048, +793760, +3345423, +716318, +1033698, +4011882, +-965219, +1258434, +1579522, +-4249500, +3233648, +-424838, +2640541, +1020028, +4933599, +-1964947, +3237309, +-1251962, +-437406, +-2749192, +-2943112, +-117113, +778507, +2757711, +3478291, +-661571, +1077087, +-3821174, +2731860, +3035264, +-4424379, +295413, +3873542, +-1272809, +4145370, +-363272, +2240544, +88954, +-2016552, +-862779, +-844808, +3142493, +2019692, +3648148, +3857820, +593190, +1285134, +-4257140, +-1476035, +-1951773, +-2334649, +1355368, +-4390456, +3666652, +562848, +-8226958, +1134896, +1136697, +-2132899, +3300228, +1855661, +6476864, +5097743, +-1373818, +3287769, +1709294, +-2926119, +2463141, +400199, +3051372, +1815531, +1746372, +398117, +2333959, +-708565, +-4241370, +51697, +-1626217, +26865, +2248300, +3357859, +-325912, +194201, +612298, +388227, +256018, +-5630155, +-1085451, +653494, +-1966315, +-273079, +-4296295, +-2813232, +2079672, +2378463, +-3869089, +-438799, +-725265, +3152791, +3461913, +-777750, +47521, +2588203, +1888001, +-4445421, +654349, +811737, +418334, +-1854075, +-5194402, +-1571674, +-622026, +-1091628, +1787463, +3439585, +2923276, +-1997884, +-193963, +-731696, +3686658, +-1311796, +-5219031, +-2906251, +2140229, +-1846978, +2541247, +-3677377, +-3935140, +3605308, +4807232, +-1633864, +344286, +-2051894, +2498349, +-3085, +379207, +-701595, +-1080351, +3161365, +-1606976, +1640595, +3757649, +798095, +-3167055, +-2288739, +2301831, +-3324819, +3219538, +516049, +-3153835, +7342606, +1098913, +-2522436, +376783, +47367, +530901, +-395499, +304200, +}; + +#define SIZE 24 +__attribute__ ((section(".heapsram"))) int g_mA[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int g_mB[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int g_mC[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int 
g_mB_tmp[SIZE][SIZE]; diff --git a/carfield/parMatrixMul8/Makefile b/carfield/parMatrixMul8/Makefile new file mode 100755 index 0000000..7c755b8 --- /dev/null +++ b/carfield/parMatrixMul8/Makefile @@ -0,0 +1,8 @@ +PULP_APP = test +PULP_APP_SRCS = matrixMul.c + +PULP_CFLAGS = -O3 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +#pulp-bench-reg --name=parMatrixMul8.cycles --module=pulp_rtl_testset --pipeline=$(PIPELINE) --artefact=pulp_rtl_testset --cmd="make run -f Makefile.sdk" --probe-regexp='matrixMul -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(8)" --probe-regexp='matrixMulTranspose -> success, nr. of errors: 0, execution time: (\d+)' --params="platform($(platformName)),compiler($(OR1K_TOOLCHAIN_TYPE)),nbCores(4),elemSize(8),transposed" diff --git a/carfield/parMatrixMul8/gen_stimuli.py b/carfield/parMatrixMul8/gen_stimuli.py new file mode 100755 index 0000000..153d5c3 --- /dev/null +++ b/carfield/parMatrixMul8/gen_stimuli.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import sys +import random + + +def write_arr(f, name, arr): + f.write('const char %s[] = {\n' % name) + for v in arr: + f.write('%d,\n' % (v)) + f.write('};\n\n') + return + +################################################################################ +f = open('parMatrixMul8_stimuli.h', 'w') + + +SIZE = 24 +RANGE = 4 + +m_a = [] +m_b = [] +m_exp = [] + +for i in range(0,SIZE): + for j in range(0,SIZE): + a = random.randint(-RANGE, RANGE-1) + b = random.randint(-RANGE, RANGE-1) + + m_a.append(a) + m_b.append(b) + +for i in range(0,SIZE): + for j in range(0,SIZE): + r = 0 + + for k in range (0,SIZE): + r = r + m_a[i * SIZE + k] * m_b[k * SIZE + j] + + m_exp.append(r) + + +write_arr(f, 'm_a', m_a) +write_arr(f, 'm_b', m_b) +write_arr(f, 'm_exp', m_exp) + +f.write('#define SIZE %d\n' % SIZE) + + +f.write('__attribute__ ((section(".heapsram"))) char g_mA[SIZE][SIZE];\n') +f.write('__attribute__ 
((section(".heapsram"))) char g_mB[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) char g_mC[SIZE][SIZE];\n') +f.write('__attribute__ ((section(".heapsram"))) char g_mB_tmp[SIZE][SIZE];\n') + diff --git a/carfield/parMatrixMul8/matrixMul.c b/carfield/parMatrixMul8/matrixMul.c new file mode 100644 index 0000000..357fdf0 --- /dev/null +++ b/carfield/parMatrixMul8/matrixMul.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ + +#include "pulp.h" + +#include "parMatrixMul8_stimuli.h" + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()); +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "matrixMul", .test = check_matrix_mul }, + { .name = "matrixMulTranspose", .test = check_matrix_mul_transpose }, + {0, 0} +}; + +unsigned int num_cores; + +int main() +{ + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + num_cores = get_core_num(); + + if(rt_core_id() < num_cores) { + run_suite(testcases); + } + + synch_barrier(); + + return 0; +} + +void matrix_init(); +unsigned int matrix_check(); + +void check_matrix_mul(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB[k][j]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void check_matrix_mul_transpose(testresult_t *result, void (*start)(), void (*stop)()) { + int core_id; + unsigned int i, j, k; + unsigned int chunk; + unsigned int lb, ub; + + core_id = get_core_id(); + + // number of rows each core has to multiply + chunk = SIZE / num_cores; + // lower bound + lb = core_id * chunk; + // upper bound + ub = lb + chunk; + + if(core_id == 0) { + matrix_init(); + } + + if(num_cores != 1) synch_barrier(); + + // start benchmark + start(); + + // transpose 
array before using it + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mB_tmp[i][j] = g_mB[j][i]; + } + } + + if(num_cores != 1) synch_barrier(); + + for(i = lb; i < ub; i++) { + for(j = 0; j < SIZE; j++) { + g_mC[i][j] = 0; + + for(k = 0; k < SIZE; k++) { + g_mC[i][j] += g_mA[i][k] * g_mB_tmp[j][k]; + } + } + } + + if(num_cores != 1) synch_barrier(); + + stop(); + + if(core_id == 0) { + result->errors = matrix_check(); + } +} + +void matrix_init() { + unsigned int i, j; + + // init, copy to TCDM + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + g_mA[i][j] = m_a[i * SIZE + j]; + g_mB[i][j] = m_b[i * SIZE + j]; + g_mC[i][j] = 0; + } + } +} + +unsigned int matrix_check() { + unsigned int errors = 0; + unsigned int i, j; + // check + for(i = 0; i < SIZE; i++) { + for(j = 0; j < SIZE; j++) { + if(g_mC[i][j] != m_exp[i * SIZE + j]) { + printf("At index %d, %d\n", i, j, 0, 0); + errors++; + } + } + } + + return errors; +} diff --git a/carfield/parMatrixMul8/parMatrixMul8_stimuli.h b/carfield/parMatrixMul8/parMatrixMul8_stimuli.h new file mode 100644 index 0000000..78d026c --- /dev/null +++ b/carfield/parMatrixMul8/parMatrixMul8_stimuli.h @@ -0,0 +1,1742 @@ +const char m_a[] = { +-1, +0, +0, +0, +-4, +-4, +-4, +1, +-1, +-2, +-3, +3, +2, +-4, +1, +-2, +-3, +0, +-3, +-4, +-3, +-1, +-3, +-3, +2, +-2, +2, +3, +0, +2, +0, +-1, +-3, +-2, +-1, +-3, +1, +-1, +-1, +0, +-1, +-2, +-1, +-4, +-2, +-3, +-1, +-2, +1, +1, +2, +-1, +0, +2, +1, +-3, +2, +-3, +0, +0, +0, +2, +-2, +3, +-4, +2, +-2, +-3, +-2, +-3, +3, +-3, +0, +1, +-1, +-1, +1, +-2, +-1, +-4, +-2, +-1, +-4, +-4, +-2, +-1, +-2, +-3, +-4, +-4, +1, +0, +-2, +1, +-1, +1, +2, +-3, +-4, +0, +2, +-3, +-3, +0, +0, +-2, +-3, +-1, +-2, +-1, +-4, +-1, +-4, +-3, +-3, +0, +1, +-3, +-4, +1, +2, +0, +2, +-1, +-4, +-1, +-4, +2, +2, +-1, +3, +0, +0, +-1, +1, +3, +0, +2, +0, +3, +2, +2, +2, +-3, +2, +2, +-4, +0, +-3, +-1, +-1, +-3, +-4, +3, +2, +0, +2, +1, +0, +-2, +-1, +3, +3, +0, +1, +-3, +3, +0, +-3, +2, +-3, +1, 
+2, +0, +3, +-1, +0, +-2, +-4, +0, +-3, +0, +3, +0, +-1, +-3, +0, +-2, +-4, +-4, +-1, +1, +2, +-4, +1, +-4, +-4, +-3, +-3, +3, +-1, +-2, +-4, +2, +1, +2, +-1, +1, +3, +0, +2, +-4, +-3, +1, +-2, +0, +1, +1, +3, +-2, +-4, +2, +-1, +-1, +1, +2, +-2, +1, +1, +2, +0, +3, +-4, +2, +-1, +-1, +-3, +-3, +-3, +-3, +2, +-3, +1, +0, +3, +-3, +3, +3, +-4, +1, +-4, +1, +2, +-3, +1, +-4, +-1, +-4, +-3, +-2, +2, +3, +0, +-4, +1, +3, +0, +3, +-1, +-4, +3, +-2, +-2, +2, +-4, +-1, +-1, +3, +-2, +-4, +-1, +-1, +2, +-1, +2, +0, +-2, +-4, +0, +2, +-4, +0, +1, +-4, +2, +0, +-1, +-1, +-3, +2, +3, +2, +-2, +1, +-4, +-3, +1, +2, +-1, +-4, +3, +-1, +2, +-2, +-3, +0, +-1, +3, +-4, +-3, +-3, +-3, +0, +0, +3, +1, +-2, +-2, +-4, +2, +2, +0, +3, +-2, +1, +3, +-1, +3, +-4, +-2, +0, +-4, +0, +-1, +-3, +-2, +-4, +-4, +-2, +-4, +-3, +-3, +0, +2, +-4, +1, +0, +-3, +0, +-4, +-4, +3, +3, +-2, +1, +3, +3, +1, +-3, +-4, +-4, +1, +1, +2, +-4, +0, +3, +-1, +3, +1, +-4, +-2, +1, +-1, +3, +3, +-1, +-1, +0, +-2, +2, +-2, +-2, +1, +3, +1, +3, +-1, +2, +-1, +0, +1, +2, +-4, +-3, +1, +-3, +-1, +-4, +-1, +-4, +-4, +-3, +-3, +3, +0, +3, +0, +-2, +3, +-3, +3, +-1, +-1, +2, +-2, +1, +-4, +0, +-3, +3, +0, +-2, +-1, +3, +2, +-2, +3, +-3, +0, +-2, +1, +0, +-1, +1, +3, +-4, +3, +0, +3, +-4, +3, +-1, +-1, +1, +3, +1, +0, +-4, +3, +-3, +-4, +-1, +2, +-3, +-4, +-2, +-3, +2, +3, +-2, +-2, +-4, +-1, +1, +-4, +-2, +2, +-3, +1, +-3, +-2, +2, +3, +-2, +0, +1, +-1, +2, +-2, +1, +-1, +-4, +-3, +-4, +-3, +-3, +-3, +-4, +3, +1, +1, +3, +-4, +0, +-4, +2, +2, +3, +2, +-4, +1, +-1, +-2, +-2, +2, +-2, +2, +-3, +2, +0, +0, +0, +-4, +-2, +-2, +2, +1, +0, +2, +3, +2, +3, +-3, +0, +-3, +-2, +2, +-1, +2, +-3, +1, +0, +1, +3, +-4, +0, +-3, +-1, +1, +-2, +-2, +0, +0, +1, +-3, +-2, +0, +3, +2, +0, +3, +-4, +2, +1, +3, +-2, +-2, +-3, +2, +2, +2, +-1, +3, +2, +-3, +}; + +const char m_b[] = { +3, +-2, +-3, +-4, +-3, +2, +-4, +-1, +-4, +-4, +-1, +-3, +3, +0, +2, +1, +2, +-2, +0, +3, +-4, +-2, +2, +-1, +1, +-4, +-4, +-1, +1, +-3, +-2, +1, +-3, +1, 
+-2, +2, +1, +-3, +-1, +-4, +1, +-1, +-3, +3, +0, +0, +-2, +0, +-4, +-3, +0, +-3, +0, +3, +-1, +-4, +-2, +2, +-4, +-1, +-3, +1, +-4, +-4, +0, +2, +3, +2, +0, +-3, +-1, +-1, +2, +2, +-2, +-2, +-2, +-4, +3, +-3, +2, +1, +-1, +1, +-3, +-1, +2, +-1, +1, +-1, +0, +-4, +-2, +3, +0, +-4, +-2, +3, +-3, +1, +-1, +-1, +-2, +1, +-2, +-4, +0, +0, +0, +0, +-4, +-3, +3, +-2, +3, +3, +2, +3, +-4, +1, +-2, +-3, +-3, +2, +3, +-4, +-1, +3, +0, +3, +2, +1, +-1, +-1, +-4, +-1, +3, +2, +-1, +3, +0, +3, +0, +1, +-4, +0, +3, +2, +-4, +-2, +-2, +-4, +-3, +-2, +1, +0, +-2, +1, +3, +-2, +-1, +-3, +0, +3, +0, +0, +0, +2, +3, +-4, +-4, +2, +2, +3, +1, +0, +0, +-2, +1, +-2, +-4, +-3, +0, +-2, +2, +3, +2, +3, +3, +1, +1, +1, +3, +1, +3, +2, +-3, +1, +-1, +-2, +-2, +-2, +-1, +2, +2, +0, +-3, +-1, +2, +-4, +-2, +2, +-3, +-2, +-4, +3, +-1, +3, +2, +3, +-1, +-1, +-1, +-2, +-2, +2, +2, +-1, +-3, +-1, +3, +-3, +3, +3, +2, +-3, +2, +-3, +-3, +-2, +3, +-1, +0, +-2, +-1, +-1, +2, +-1, +-2, +1, +-4, +1, +3, +2, +1, +2, +-1, +-1, +-1, +-4, +0, +3, +1, +-2, +0, +2, +-2, +-2, +3, +-1, +-1, +0, +3, +3, +-1, +0, +0, +-4, +1, +-4, +-4, +0, +2, +3, +-3, +-2, +-3, +-3, +-2, +0, +-4, +-1, +0, +-2, +1, +-1, +-4, +1, +2, +0, +-2, +0, +2, +2, +2, +3, +-3, +1, +0, +-2, +2, +3, +-1, +-2, +1, +-3, +-1, +2, +2, +1, +3, +-2, +0, +-2, +-4, +1, +1, +1, +-2, +-1, +2, +0, +1, +-1, +-3, +-1, +1, +-1, +-1, +-3, +-4, +-2, +-2, +-1, +0, +3, +3, +0, +-2, +-2, +-2, +-3, +2, +2, +1, +3, +0, +3, +0, +-1, +-1, +3, +3, +-4, +1, +1, +-2, +-4, +-4, +3, +1, +0, +-1, +-4, +-2, +2, +0, +1, +-1, +0, +-3, +-2, +-1, +1, +-3, +-2, +2, +-4, +-3, +-3, +0, +0, +-3, +-3, +2, +0, +1, +-2, +-3, +-1, +3, +-1, +3, +-3, +-4, +-4, +0, +-4, +-1, +2, +1, +0, +0, +2, +2, +3, +2, +-1, +0, +-3, +-3, +3, +0, +-4, +0, +-2, +-2, +-1, +1, +3, +3, +1, +-3, +1, +-2, +-1, +2, +0, +0, +2, +-1, +-4, +-1, +-3, +0, +2, +-4, +-1, +0, +-2, +-1, +2, +-2, +-2, +3, +3, +0, +3, +1, +0, +2, +3, +2, +-2, +-4, +-1, +3, +3, +-1, +2, +0, +-2, +2, +-4, +-3, +-3, +-3, +-3, +-3, +0, 
+3, +3, +3, +0, +-3, +-1, +2, +3, +-3, +-3, +-4, +-1, +3, +0, +-4, +-2, +-3, +1, +3, +3, +-2, +2, +-4, +-2, +1, +-3, +-3, +-2, +3, +-3, +0, +-4, +0, +2, +-4, +0, +1, +-1, +3, +3, +3, +1, +3, +-4, +-4, +-1, +-3, +-3, +0, +1, +-1, +-3, +-4, +2, +3, +0, +-4, +-2, +-3, +0, +3, +2, +0, +-2, +-4, +-3, +-3, +-3, +-2, +2, +-4, +-4, +2, +3, +-3, +2, +-2, +-2, +-1, +-2, +-2, +-1, +3, +-4, +-4, +3, +-3, +-3, +-1, +-1, +2, +3, +-3, +-1, +-3, +-4, +}; + +const char m_exp[] = { +27, +-26, +-19, +13, +47, +17, +-5, +14, +88, +66, +2, +14, +6, +-39, +56, +35, +-19, +61, +-13, +4, +46, +1, +70, +27, +-22, +-28, +-13, +-5, +-4, +-6, +-10, +5, +26, +14, +3, +-1, +-3, +0, +10, +43, +4, +32, +-3, +6, +36, +26, +66, +20, +-20, +-10, +21, +-17, +-5, +36, +-9, +11, +10, +17, +-32, +12, +20, +9, +-1, +15, +-42, +-21, +-51, +24, +20, +26, +10, +42, +-4, +2, +-11, +41, +22, +4, +-17, +10, +28, +-10, +2, +10, +25, +32, +18, +43, +-3, +-9, +-9, +-4, +25, +-5, +9, +24, +54, +43, +14, +59, +18, +36, +-7, +30, +50, +-30, +21, +3, +27, +7, +21, +55, +-17, +-2, +9, +17, +31, +8, +12, +29, +43, +-17, +14, +-55, +13, +50, +24, +-13, +-33, +15, +-45, +9, +36, +-9, +-6, +14, +-1, +2, +-27, +-37, +-40, +-14, +2, +-10, +6, +27, +-8, +-13, +-17, +-4, +28, +51, +-5, +-14, +2, +-23, +17, +35, +41, +-3, +-37, +-12, +-49, +-31, +7, +15, +36, +-15, +-21, +6, +2, +36, +-12, +-27, +-34, +9, +44, +8, +57, +20, +-22, +-22, +-11, +-15, +-8, +-15, +-9, +49, +37, +26, +6, +30, +-17, +-43, +-3, +-31, +36, +49, +-15, +8, +51, +-8, +17, +-51, +-16, +-21, +12, +49, +-11, +41, +36, +15, +-5, +-69, +60, +14, +-24, +-21, +23, +3, +41, +41, +-27, +0, +33, +63, +18, +1, +-36, +-20, +18, +7, +-1, +37, +-33, +30, +36, +-29, +38, +57, +-8, +-21, +-26, +15, +-10, +-4, +-17, +-19, +8, +-9, +-13, +0, +10, +-30, +59, +12, +11, +51, +60, +19, +35, +6, +3, +-5, +-19, +-10, +20, +6, +-12, +-5, +5, +11, +33, +-33, +-27, +-2, +-2, +3, +53, +-1, +-15, +-10, +13, +-5, +24, +-17, +10, +8, +-3, +41, +-9, +9, +0, +30, +7, +7, +-10, +-28, +21, 
+13, +4, +-3, +44, +9, +-49, +-31, +-51, +23, +16, +-13, +-12, +36, +23, +34, +-39, +8, +20, +2, +32, +73, +24, +-16, +0, +-30, +17, +52, +-10, +28, +-43, +-1, +-34, +16, +-4, +39, +31, +5, +5, +3, +36, +48, +5, +28, +12, +57, +25, +-28, +14, +34, +49, +11, +41, +20, +-12, +-18, +-43, +-29, +58, +26, +44, +36, +-100, +-46, +-24, +0, +54, +4, +-2, +21, +5, +-12, +22, +9, +-35, +-5, +-36, +-22, +-10, +-11, +-15, +34, +20, +-33, +17, +39, +-13, +-32, +-36, +-22, +-3, +-38, +-39, +-6, +0, +15, +-6, +-15, +9, +-10, +45, +16, +17, +13, +-8, +15, +11, +-4, +59, +18, +-9, +12, +69, +27, +-30, +16, +7, +30, +41, +-30, +9, +-4, +23, +-5, +8, +3, +-75, +-25, +5, +14, +12, +-21, +-21, +6, +-13, +15, +8, +-6, +30, +24, +41, +-12, +5, +39, +18, +-4, +-60, +19, +-1, +-3, +-7, +16, +-11, +-51, +2, +-10, +-17, +-32, +39, +-35, +-46, +10, +27, +1, +21, +13, +78, +-12, +-2, +-28, +-20, +11, +75, +85, +-18, +21, +-7, +-9, +33, +19, +71, +23, +25, +2, +47, +32, +8, +24, +46, +-22, +23, +18, +-58, +66, +52, +22, +32, +25, +-34, +-26, +-21, +-69, +6, +27, +-28, +-13, +-15, +10, +-20, +-16, +20, +25, +7, +22, +52, +-4, +-8, +-28, +-18, +-6, +-36, +6, +-26, +12, +32, +8, +11, +16, +0, +-18, +-33, +-10, +11, +-40, +-6, +-29, +-34, +-30, +15, +15, +-27, +9, +8, +-43, +0, +-5, +4, +12, +60, +-11, +-4, +-32, +-25, +-38, +-37, +-17, +-1, +20, +37, +43, +7, +-2, +12, +-7, +-7, +-14, +-30, +9, +50, +-18, +-9, +-1, +-9, +6, +16, +-38, +-13, +30, +}; + +#define SIZE 24 +__attribute__ ((section(".heapsram"))) char g_mA[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) char g_mB[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) char g_mC[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) char g_mB_tmp[SIZE][SIZE]; diff --git a/carfield/redmule b/carfield/redmule new file mode 120000 index 0000000..bd0da93 --- /dev/null +++ b/carfield/redmule @@ -0,0 +1 @@ +../hwpe/redmule/ \ No newline at end of file diff --git a/fpu_tests.yaml b/fpu_tests.yaml new file mode 100644 index 0000000..963d381 --- 
/dev/null +++ b/fpu_tests.yaml @@ -0,0 +1,11 @@ +fpu_tests: + fp32: + path: ./fpu_tests/matmul/FP32 + command: make clean all cores=8 platform=rtl fmt_A=FP32 fmt_B=FP32 fmt_OUT=FP32 thr=0.004f check=1 run + fp16: + path: ./fpu_tests/matmul/FP16 + command: make clean all cores=8 platform=rtl fmt_A=FP16 fmt_B=FP16 fmt_OUT=FP16 thr=0.004f check=1 vect=1 run + fp16alt: + path: ./fpu_tests/matmul/FP16ALT + command: make clean all cores=8 platform=rtl fmt_A=FP16ALT fmt_B=FP16ALT fmt_OUT=FP16ALT thr=0.04f check=1 vect=1 run + \ No newline at end of file diff --git a/fpu_tests/matmul/FP16/Makefile b/fpu_tests/matmul/FP16/Makefile new file mode 100644 index 0000000..f9116c0 --- /dev/null +++ b/fpu_tests/matmul/FP16/Makefile @@ -0,0 +1,82 @@ +PULP_APP = test + +PULP_APP_FC_SRCS = main.c +PULP_APP_SRCS = support_func.c matmul.c + +PULP_CFLAGS += -O3 -g3 +PULP_CFLAGS += -mno-memcpy + +ifdef cores +PULP_CFLAGS += -DNUM_CORES=${cores} #-flto -DFABRIC=1 +else +PULP_CFLAGS += -DNUM_CORES=1 +endif + + +ifdef FABRIC +PULP_CFLAGS += -DFABRIC +endif + +ifdef cores +PULP_CFLAGS += -DUSE_INTRINSICS +endif + +ifdef thr +PULP_CFLAGS += -DTHR=${thr} +endif + +PULP_CFLAGS += -fno-tree-vectorize + + +ifdef fmt +PULP_CFLAGS += -D${fmt} -DFIXED + +else +# FP FORMAT +#INPUT DATA TYPE +ifdef fmt_A +PULP_CFLAGS += -DMA${fmt_A} +else +PULP_CFLAGS += -DMAFP32 +endif + +#FILTER DATA TYPE +ifdef fmt_B +PULP_CFLAGS += -DMB${fmt_B} +else +PULP_CFLAGS += -DMBFP32 +endif + +# OUTPUT DATA TYPE + +ifdef fmt_OUT +PULP_CFLAGS += -DOUT${fmt_OUT} +else +PULP_CFLAGS += -DOUTFP32 +endif +endif + +# VECTORIAL FORMAT for half-precision FP +ifdef vec +PULP_CFLAGS += -DVECTORIAL +endif + +# CHECK RESULTS +ifdef check +PULP_CFLAGS += -DCHECK +endif + +ifdef PRINT_RESULTS +PULP_CFLAGS += -DPRINT_RESULTS +endif + +ifdef verbose +PULP_CFLAGS += -DVERBOSE +endif + +# STATISTICS +ifdef stats +PULP_CFLAGS += -DSTATS +endif + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/fpu_tests/matmul/FP16/config.h 
b/fpu_tests/matmul/FP16/config.h new file mode 100644 index 0000000..9505082 --- /dev/null +++ b/fpu_tests/matmul/FP16/config.h @@ -0,0 +1,96 @@ +#ifndef _CONFIG_MATMUL_ +#define _CONFIG_MATMUL_ + +#ifdef FABRIC +#define DATA_LOCATION +#else +#define DATA_LOCATION L1_DATA +#endif + +//Define INPUT data types + +#ifdef FIXED + #ifdef FP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MA_TYPE; + typedef float16 MB_TYPE; + typedef float16 OUT_TYPE; + typedef float16 MA_VTYPE __attribute__((vector_size (4))); + typedef float16 MB_VTYPE __attribute__((vector_size (4))); + typedef float16 OUT_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif defined(FP16ALT) + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MA_TYPE; + typedef float16alt MB_TYPE; + typedef float16alt OUT_TYPE; + typedef float16alt MA_VTYPE __attribute__((vector_size (4))); + typedef float16alt MB_VTYPE __attribute__((vector_size (4))); + typedef float16alt OUT_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif defined(FP32) + typedef float MA_TYPE; + typedef float MB_TYPE; + typedef float OUT_TYPE; + #endif + +#else // MIXED + #ifdef MAFP32 + typedef float MA_TYPE; + #elif MAFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MA_TYPE; + typedef float16 MA_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif MAFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MA_TYPE; + typedef float16alt MA_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #endif + + #ifdef MBFP32 + typedef float MB_TYPE; + #elif MBFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MB_TYPE; + typedef float16 MB_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif MBFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MB_TYPE; + typedef 
float16alt MB_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #endif + // Define output data types + #ifdef OUTFP32 + typedef float OUT_TYPE; + #elif OUTFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 OUT_TYPE; + typedef float16 OUT_VTYPE __attribute__((vector_size (4))); + #elif OUTFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt OUT_TYPE; + typedef float16alt OUT_VTYPE __attribute__((vector_size (4))); + #endif + +#endif + +#ifndef THR + #define THR 0.004f +#endif +#ifdef VECTORIAL + #if defined(MAFP16) && defined (MBFP16ALT) || defined (MAFP16ALT) && defined (MBFP16) + #error "Vecotrization does not work for different data types...!!!" + #endif + + #if defined (MAFP32) || defined (MBFP32) || defined (OUTFP32) + + #error "Vecotrization does not work for FP32 data type...!!!" + #endif +#endif + +void matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P); + +#endif diff --git a/fpu_tests/matmul/FP16/data.h b/fpu_tests/matmul/FP16/data.h new file mode 100644 index 0000000..5c74f33 --- /dev/null +++ b/fpu_tests/matmul/FP16/data.h @@ -0,0 +1,7 @@ +#define M 8 +#define N 8 +#define P 8 + +PI_L2 MA_TYPE A_mat[] = {-1.4462890625, -0.329345703125, 0.3203125, -1.083984375, 0.25146484375, 0.15234375, -1.6630859375, 0.71728515625, 0.1917724609375, 0.5859375, 2.029296875, -0.1700439453125, 0.2003173828125, -0.1534423828125, 1.2353515625, 0.73681640625, 0.38427734375, -0.6748046875, 0.60791015625, -0.253662109375, 1.62890625, 1.1982421875, -0.134521484375, -0.3447265625, -1.3662109375, 0.039459228515625, 0.1907958984375, -1.96875, -0.18115234375, -0.11669921875, -0.1326904296875, 0.281494140625, 0.70556640625, -0.58837890625, -0.7724609375, 0.07135009765625, -0.1737060546875, 0.486328125, 0.018463134765625, -0.364501953125, 0.478271484375, 0.98828125, 0.6455078125, -0.2303466796875, 0.306640625, 0.22119140625, 0.364013671875, 
0.0013647079467773438, -0.43408203125, -0.59375, 0.168701171875, 0.9921875, -0.04827880859375, -0.6396484375, 0.9873046875, -0.09088134765625, -0.61376953125, 1.4345703125, 0.400634765625, 0.259033203125, 0.93115234375, 1.4326171875, -1.08984375, -1.767578125, }; +PI_L2 MB_TYPE B_mat[] = {-0.345703125, 0.41845703125, -1.3466796875, -0.826171875, 1.490234375, -0.86474609375, 1.185546875, -0.12469482421875, -0.51025390625, -0.724609375, -0.057769775390625, -1.701171875, -0.92236328125, -0.422119140625, 0.21533203125, -0.0709228515625, -0.497314453125, -0.81298828125, 1.9052734375, -0.60546875, -1.1884765625, 1.6962890625, -0.306884765625, 0.1558837890625, 0.53759765625, 0.360107421875, 0.128662109375, -1.00390625, -0.8955078125, -0.16064453125, 0.73583984375, -1.662109375, 0.64404296875, 1.5068359375, 0.03680419921875, -0.92431640625, -0.62255859375, -1.263671875, -0.92529296875, 1.1123046875, -1.1728515625, 2.85546875, 1.126953125, -1.2001953125, 1.0205078125, 1.5361328125, 1.1337890625, -1.283203125, 1.568359375, -0.95703125, -0.2474365234375, -1.115234375, -0.416748046875, 1.0478515625, 1.419921875, 1.8359375, -0.41845703125, 0.007236480712890625, -1.646484375, -0.01568603515625, 1.05078125, -0.89013671875, 0.55126953125, -1.8427734375, }; +PI_L2 OUT_TYPE ref[] = {-2.998046875, 1.3935546875, 1.8505859375, 4.07421875, 0.18408203125, -0.357421875, -4.7109375, -2.236328125, 0.471923828125, -3.369140625, 1.869140625, -3.60546875, -2.53515625, 3.203125, 1.4072265625, 1.86328125, -0.650390625, 6.06640625, 2.66015625, -2.0703125, 0.6025390625, 0.97265625, -0.591796875, 1.1796875, -1.0068359375, -1.943359375, 1.37890625, 3.375, -0.19140625, 1.46484375, -3.115234375, 2.65625, -0.0216064453125, 2.482421875, -1.240234375, 0.376220703125, 2.66015625, -0.372802734375, 1.5380859375, -0.396484375, -0.607421875, -0.378662109375, 0.66748046875, -3.19140625, -0.87548828125, 0.6337890625, 0.8974609375, 1.076171875, 3.20703125, -2.375, 0.2509765625, -0.0181121826171875, -2.318359375, 
0.94677734375, 0.70751953125, 1.220703125, -2.62890625, 4.99609375, 6.3671875, -3.77734375, -3.46875, 2.01953125, -2.111328125, 0.0625, }; diff --git a/fpu_tests/matmul/FP16/main.c b/fpu_tests/matmul/FP16/main.c new file mode 100644 index 0000000..7d12782 --- /dev/null +++ b/fpu_tests/matmul/FP16/main.c @@ -0,0 +1,122 @@ +// License, Version 0.51 (the "License"); you may not use this file except in +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "config.h" +#include "pulp.h" + +#include +#include +#include /* for CHAR_BIT */ +#include + +#include "data.h" + +#define STACK_SIZE 2048 + +void main_fn(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "Matrix Multiplication", .test = main_fn }, + {0, 0} +}; + +DATA_LOCATION MA_TYPE matA[M*N] __attribute__ ((aligned (4))); +DATA_LOCATION MB_TYPE matB[N*P] __attribute__ ((aligned (4))); +DATA_LOCATION OUT_TYPE matC[M*P] __attribute__ ((aligned (4))); + +// End of computation +int done = 0; + +int retval = 0; + +void __attribute__ ((noinline)) matrix_init(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C) { + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++){ + A[i*N+j] = A_mat[i*N+j]; + + + } + + for (int i = 0; i < N; i++) + for (int j = 0; j < P; j++){ + B[i*P+j] = B_mat[i*P+j]; + } + for (int i = 0; i < M; i++) + for (int j = 0; j < P; j++) + C[i*P+j] = 0; + +} + +int __attribute ((noinline)) check_result(OUT_TYPE * __restrict__ result) { + #ifndef FABRIC + synch_barrier(); + #endif + + if(get_core_id() == 0) { + float diff; + int err = 0; + + for (int i = 0; i < (M*P); i++) { + diff = fabs(result[i] - ref[i]); + if(diff > THR) { + err++; + #ifdef VERBOSE + + 
printf("Error at index %d:\t refrence %f\t output %f\t error %.4f\n", i, ref[i], result[i], diff); + #endif + + } + + #ifdef PRINT_RESULTS + + printf("index %d:\t refrence %f\t output %f\t error %f\n", i, ref[i], result[i], diff); + #endif + } + + return err; + + } +} + +void main_fn(testresult_t *result, void (*start)(), void (*stop)()){ + + if (get_core_id() == 0) + matrix_init(matA, matB, matC); + + #ifndef FABRIC + synch_barrier(); + #endif + + #ifdef STATS + start(); + #endif + matMul(matA, matB, matC, M, N, P); + + #ifdef STATS + stop(); + #endif + + #ifdef CHECK + result->errors = check_result(matC); + #endif +}; + +int main() +{ + #ifdef FABRIC + main_fn(); + #else + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + int nbErrors = run_suite(testcases); + + synch_barrier(); + #endif + retval = nbErrors; + + return retval; +} diff --git a/fpu_tests/matmul/FP16/matmul.c b/fpu_tests/matmul/FP16/matmul.c new file mode 100644 index 0000000..0a616ca --- /dev/null +++ b/fpu_tests/matmul/FP16/matmul.c @@ -0,0 +1,103 @@ +#include "config.h" +#include "pulp.h" + +#ifdef VECTORIAL + +void __attribute__ ((noinline)) matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P){ + + OUT_VTYPE temp; + MA_VTYPE Av; + MB_VTYPE Bv0; + MB_VTYPE Bv1; + OUT_VTYPE *Cv; + int blockSize = (M+NUM_CORES-1)/NUM_CORES; + int start = get_core_id()*blockSize; + int end = start + blockSize < M? 
start + blockSize : M; + + for (int i = start; i < end; i++) { + for (int j=0; j < (P & 0xfffffffe); j+=2) { + + temp = (OUT_VTYPE) {0, 0}; + + // Manual unrolling + for (int k=0; k<(N & 0xfffffffe); k+=2){ + Av = *((MA_VTYPE *) &A[i*N+k]); + Bv0 = *((MB_VTYPE *) &B[k*P+j]); + Bv1 = *((MB_VTYPE *) &B[k*P+j+P]); + temp += (OUT_VTYPE)(__builtin_shuffle(Av, (v2s){0,0})) * Bv0; + temp += (OUT_VTYPE)(__builtin_shuffle(Av, (v2s){1,1})) * Bv1; + } + + if (N & 0x00000001) + { + temp[0] += A[i*N+N-1] * B[(N-1)*P+j]; + temp[1] += A[i*N+N-1] * B[(N-1)*P+j+1]; + } + Cv = (OUT_VTYPE *) &C[i*P+j]; + + *Cv = temp; + } + } + /// Leftover in P + if (P & 0x00000001) + { + for (int i = start; i < end; i++) { + + OUT_TYPE temp1 = 0; + + // Manual unrolling + for (int k=0; k<(N & 0xfffffffe); k+=2){ + temp1 += A[i*N+k] * B[k*P+P-1]; + temp1 += A[i*N+k+1] * B[k*P+P-1+P]; + } + if (N & 0x00000001) + { + temp1 += A[i*N+N-1] * B[(N-1)*P+P-1]; + } + C[i*P+(P-1)]=temp1; + } + } + + #if NUM_CORES > 1 + synch_barrier(); + #endif +} +#else + +void __attribute__ ((noinline)) matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P) { + + int blockSize = (M+NUM_CORES-1)/NUM_CORES; + int start = get_core_id()*blockSize; + int end = start + blockSize < M? 
start + blockSize : M; + + for (int i = start; i < end; i++) { + for (int j = 0; j < P; j++) { + OUT_TYPE temp = 0; + + //Manual unrolling + for (int k = 0; k < (N & 0xfffffffe); k+=2) { + temp += (OUT_TYPE)(A[i*N+k] * B[k*P+j]); + temp += (OUT_TYPE)(A[i*N+k+1] * B[k*P+j+P]); + + } + C[i*P+j] = (OUT_TYPE)(temp); + } + } + // Leftover on N + + if (N & 0x00000001) + { + for (int i=start; i 1 + synch_barrier(); + #endif +} +#endif diff --git a/fpu_tests/matmul/FP16/support_func.c b/fpu_tests/matmul/FP16/support_func.c new file mode 100644 index 0000000..6d24d2e --- /dev/null +++ b/fpu_tests/matmul/FP16/support_func.c @@ -0,0 +1,15 @@ +#include "config.h" +#include "pulp.h" +double __extendohfdf2(float16alt value) +{ + float result; + __asm__ __volatile__ ("fcvt.s.ah %0, %1": "=f"(result): "f"(value) :); + return (double) result; +} + +double __extendhfdf2(float16 value) +{ + float result; + __asm__ __volatile__ ("fcvt.s.h %0, %1": "=f"(result): "f"(value) :); + return (double) result; +} diff --git a/fpu_tests/matmul/FP16ALT/Makefile b/fpu_tests/matmul/FP16ALT/Makefile new file mode 100644 index 0000000..f9116c0 --- /dev/null +++ b/fpu_tests/matmul/FP16ALT/Makefile @@ -0,0 +1,82 @@ +PULP_APP = test + +PULP_APP_FC_SRCS = main.c +PULP_APP_SRCS = support_func.c matmul.c + +PULP_CFLAGS += -O3 -g3 +PULP_CFLAGS += -mno-memcpy + +ifdef cores +PULP_CFLAGS += -DNUM_CORES=${cores} #-flto -DFABRIC=1 +else +PULP_CFLAGS += -DNUM_CORES=1 +endif + + +ifdef FABRIC +PULP_CFLAGS += -DFABRIC +endif + +ifdef cores +PULP_CFLAGS += -DUSE_INTRINSICS +endif + +ifdef thr +PULP_CFLAGS += -DTHR=${thr} +endif + +PULP_CFLAGS += -fno-tree-vectorize + + +ifdef fmt +PULP_CFLAGS += -D${fmt} -DFIXED + +else +# FP FORMAT +#INPUT DATA TYPE +ifdef fmt_A +PULP_CFLAGS += -DMA${fmt_A} +else +PULP_CFLAGS += -DMAFP32 +endif + +#FILTER DATA TYPE +ifdef fmt_B +PULP_CFLAGS += -DMB${fmt_B} +else +PULP_CFLAGS += -DMBFP32 +endif + +# OUTPUT DATA TYPE + +ifdef fmt_OUT +PULP_CFLAGS += -DOUT${fmt_OUT} +else 
+PULP_CFLAGS += -DOUTFP32 +endif +endif + +# VECTORIAL FORMAT for half-precision FP +ifdef vec +PULP_CFLAGS += -DVECTORIAL +endif + +# CHECK RESULTS +ifdef check +PULP_CFLAGS += -DCHECK +endif + +ifdef PRINT_RESULTS +PULP_CFLAGS += -DPRINT_RESULTS +endif + +ifdef verbose +PULP_CFLAGS += -DVERBOSE +endif + +# STATISTICS +ifdef stats +PULP_CFLAGS += -DSTATS +endif + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/fpu_tests/matmul/FP16ALT/config.h b/fpu_tests/matmul/FP16ALT/config.h new file mode 100644 index 0000000..9505082 --- /dev/null +++ b/fpu_tests/matmul/FP16ALT/config.h @@ -0,0 +1,96 @@ +#ifndef _CONFIG_MATMUL_ +#define _CONFIG_MATMUL_ + +#ifdef FABRIC +#define DATA_LOCATION +#else +#define DATA_LOCATION L1_DATA +#endif + +//Define INPUT data types + +#ifdef FIXED + #ifdef FP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MA_TYPE; + typedef float16 MB_TYPE; + typedef float16 OUT_TYPE; + typedef float16 MA_VTYPE __attribute__((vector_size (4))); + typedef float16 MB_VTYPE __attribute__((vector_size (4))); + typedef float16 OUT_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif defined(FP16ALT) + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MA_TYPE; + typedef float16alt MB_TYPE; + typedef float16alt OUT_TYPE; + typedef float16alt MA_VTYPE __attribute__((vector_size (4))); + typedef float16alt MB_VTYPE __attribute__((vector_size (4))); + typedef float16alt OUT_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif defined(FP32) + typedef float MA_TYPE; + typedef float MB_TYPE; + typedef float OUT_TYPE; + #endif + +#else // MIXED + #ifdef MAFP32 + typedef float MA_TYPE; + #elif MAFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MA_TYPE; + typedef float16 MA_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif MAFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef 
float16alt MA_TYPE; + typedef float16alt MA_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #endif + + #ifdef MBFP32 + typedef float MB_TYPE; + #elif MBFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MB_TYPE; + typedef float16 MB_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif MBFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MB_TYPE; + typedef float16alt MB_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #endif + // Define output data types + #ifdef OUTFP32 + typedef float OUT_TYPE; + #elif OUTFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 OUT_TYPE; + typedef float16 OUT_VTYPE __attribute__((vector_size (4))); + #elif OUTFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt OUT_TYPE; + typedef float16alt OUT_VTYPE __attribute__((vector_size (4))); + #endif + +#endif + +#ifndef THR + #define THR 0.004f +#endif +#ifdef VECTORIAL + #if defined(MAFP16) && defined (MBFP16ALT) || defined (MAFP16ALT) && defined (MBFP16) + #error "Vecotrization does not work for different data types...!!!" + #endif + + #if defined (MAFP32) || defined (MBFP32) || defined (OUTFP32) + + #error "Vecotrization does not work for FP32 data type...!!!" 
+ #endif +#endif + +void matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P); + +#endif diff --git a/fpu_tests/matmul/FP16ALT/data.h b/fpu_tests/matmul/FP16ALT/data.h new file mode 100644 index 0000000..f01329d --- /dev/null +++ b/fpu_tests/matmul/FP16ALT/data.h @@ -0,0 +1,7 @@ +#define M 8 +#define N 8 +#define P 8 + +PI_L2 MA_TYPE A_mat[] = {-0.1435546875, -2.203125, 0.20703125, -0.00146484375, -0.10498046875, -0.255859375, 1.3515625, -0.7109375, 1.171875, 0.283203125, 0.390625, 1.7890625, 1.4921875, 0.59375, 0.5625, -2.40625, -0.76953125, 0.83203125, 0.89453125, 0.1142578125, 0.365234375, -0.55859375, 2.15625, -0.5859375, -1.1328125, -0.55859375, -0.1259765625, 1.4921875, 0.41015625, 0.11669921875, -0.78515625, -1.0859375, -0.71484375, -0.67578125, -0.76171875, 1.2109375, 1.1484375, 0.66015625, 0.6875, -0.74609375, -0.03564453125, 0.72265625, 0.0849609375, -1.234375, 0.72265625, -2.140625, -0.97265625, -2.328125, -0.9453125, 0.09912109375, -0.00640869140625, -0.59765625, -0.5078125, -0.75, 0.04248046875, -0.431640625, -0.326171875, -0.52734375, 0.5078125, -1.0078125, -0.07275390625, 0.125, 0.055419921875, -1.09375, }; +PI_L2 MB_TYPE B_mat[] = {-3.21875, 0.76953125, -0.875, -0.44921875, 1.0078125, -1.7265625, 0.369140625, 0.287109375, 0.13671875, -0.50390625, -0.486328125, 0.71484375, 0.392578125, -0.1494140625, 0.5625, 0.451171875, -0.16796875, 0.40234375, 0.3671875, 1.4921875, -0.796875, 1.1015625, 0.388671875, 0.97265625, -0.609375, 1.3046875, -0.0189208984375, -1.328125, 0.0986328125, 0.67578125, -0.09716796875, -0.1962890625, 0.671875, 0.271484375, 0.7421875, -0.8828125, -1.0625, -2.171875, 0.30859375, 1.4453125, -1.5703125, -0.57421875, 1.1015625, 0.953125, 0.396484375, 0.77734375, 0.435546875, 1.9375, -0.0255126953125, 1.8046875, 1.3203125, 0.80078125, -2.140625, 0.5625, 0.08203125, -0.6796875, -1.6328125, -0.08544921875, -1.3046875, 0.3828125, 0.3515625, 0.333984375, -0.89453125, -0.1181640625, }; 
+PI_L2 OUT_TYPE ref[] = {1.59375, 3.703125, 3.609375, -0.5390625, -4.3125, 1.359375, -0.61328125, -2.3125, -0.890625, 4.53125, 4.59375, -3.34375, -2.25, -3.6875, 3.5, 3.71875, 4.40625, 3.84375, 3.859375, 2.78125, -6.59375, 2.0625, 1.09375, -0.953125, 4.5625, 0.0224609375, 2.0, -3.359375, -0.201171875, 1.3125, 0.15234375, 0.4921875, 2.53125, 2.296875, 4.09375, -3.03125, -2.953125, -0.5390625, 0.30078125, 1.0703125, 8.625, -2.109375, -0.34375, -2.0625, -0.3046875, -5.34375, 1.8359375, -1.5390625, 4.9375, -1.15625, 0.203125, 0.8828125, -0.96875, 1.609375, -0.328125, -2.296875, 3.046875, -0.9921875, 2.328125, 1.671875, -1.4140625, 0.443359375, 0.890625, 0.5859375, }; diff --git a/fpu_tests/matmul/FP16ALT/main.c b/fpu_tests/matmul/FP16ALT/main.c new file mode 100644 index 0000000..7d12782 --- /dev/null +++ b/fpu_tests/matmul/FP16ALT/main.c @@ -0,0 +1,122 @@ +// License, Version 0.51 (the "License"); you may not use this file except in +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "config.h" +#include "pulp.h" + +#include +#include +#include /* for CHAR_BIT */ +#include + +#include "data.h" + +#define STACK_SIZE 2048 + +void main_fn(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "Matrix Multiplication", .test = main_fn }, + {0, 0} +}; + +DATA_LOCATION MA_TYPE matA[M*N] __attribute__ ((aligned (4))); +DATA_LOCATION MB_TYPE matB[N*P] __attribute__ ((aligned (4))); +DATA_LOCATION OUT_TYPE matC[M*P] __attribute__ ((aligned (4))); + +// End of computation +int done = 0; + +int retval = 0; + +void __attribute__ ((noinline)) matrix_init(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C) { + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++){ + A[i*N+j] = A_mat[i*N+j]; + + + } + + for (int i = 0; i < N; i++) + for (int j = 0; j < P; j++){ + B[i*P+j] = B_mat[i*P+j]; + } + for (int i = 0; i < M; i++) + for (int j = 0; j < P; j++) + C[i*P+j] = 0; + +} + +int __attribute ((noinline)) check_result(OUT_TYPE * __restrict__ result) { + #ifndef FABRIC + synch_barrier(); + #endif + + if(get_core_id() == 0) { + float diff; + int err = 0; + + for (int i = 0; i < (M*P); i++) { + diff = fabs(result[i] - ref[i]); + if(diff > THR) { + err++; + #ifdef VERBOSE + + printf("Error at index %d:\t refrence %f\t output %f\t error %.4f\n", i, ref[i], result[i], diff); + #endif + + } + + #ifdef PRINT_RESULTS + + printf("index %d:\t refrence %f\t output %f\t error %f\n", i, ref[i], result[i], diff); + #endif + } + + return err; + + } +} + +void main_fn(testresult_t *result, void (*start)(), void (*stop)()){ + + if (get_core_id() == 0) + matrix_init(matA, matB, matC); + + #ifndef FABRIC + synch_barrier(); + #endif + + #ifdef STATS + start(); + #endif + matMul(matA, matB, matC, M, N, P); + + #ifdef STATS + stop(); + #endif + + #ifdef CHECK + result->errors = check_result(matC); + #endif +}; + +int main() +{ + #ifdef FABRIC + main_fn(); + #else + if (rt_cluster_id() != 0) + return 
bench_cluster_forward(0); + + int nbErrors = run_suite(testcases); + + synch_barrier(); + #endif + retval = nbErrors; + + return retval; +} diff --git a/fpu_tests/matmul/FP16ALT/matmul.c b/fpu_tests/matmul/FP16ALT/matmul.c new file mode 100644 index 0000000..0a616ca --- /dev/null +++ b/fpu_tests/matmul/FP16ALT/matmul.c @@ -0,0 +1,103 @@ +#include "config.h" +#include "pulp.h" + +#ifdef VECTORIAL + +void __attribute__ ((noinline)) matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P){ + + OUT_VTYPE temp; + MA_VTYPE Av; + MB_VTYPE Bv0; + MB_VTYPE Bv1; + OUT_VTYPE *Cv; + int blockSize = (M+NUM_CORES-1)/NUM_CORES; + int start = get_core_id()*blockSize; + int end = start + blockSize < M? start + blockSize : M; + + for (int i = start; i < end; i++) { + for (int j=0; j < (P & 0xfffffffe); j+=2) { + + temp = (OUT_VTYPE) {0, 0}; + + // Manual unrolling + for (int k=0; k<(N & 0xfffffffe); k+=2){ + Av = *((MA_VTYPE *) &A[i*N+k]); + Bv0 = *((MB_VTYPE *) &B[k*P+j]); + Bv1 = *((MB_VTYPE *) &B[k*P+j+P]); + temp += (OUT_VTYPE)(__builtin_shuffle(Av, (v2s){0,0})) * Bv0; + temp += (OUT_VTYPE)(__builtin_shuffle(Av, (v2s){1,1})) * Bv1; + } + + if (N & 0x00000001) + { + temp[0] += A[i*N+N-1] * B[(N-1)*P+j]; + temp[1] += A[i*N+N-1] * B[(N-1)*P+j+1]; + } + Cv = (OUT_VTYPE *) &C[i*P+j]; + + *Cv = temp; + } + } + /// Leftover in P + if (P & 0x00000001) + { + for (int i = start; i < end; i++) { + + OUT_TYPE temp1 = 0; + + // Manual unrolling + for (int k=0; k<(N & 0xfffffffe); k+=2){ + temp1 += A[i*N+k] * B[k*P+P-1]; + temp1 += A[i*N+k+1] * B[k*P+P-1+P]; + } + if (N & 0x00000001) + { + temp1 += A[i*N+N-1] * B[(N-1)*P+P-1]; + } + C[i*P+(P-1)]=temp1; + } + } + + #if NUM_CORES > 1 + synch_barrier(); + #endif +} +#else + +void __attribute__ ((noinline)) matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P) { + + int blockSize = (M+NUM_CORES-1)/NUM_CORES; + int start = 
get_core_id()*blockSize; + int end = start + blockSize < M? start + blockSize : M; + + for (int i = start; i < end; i++) { + for (int j = 0; j < P; j++) { + OUT_TYPE temp = 0; + + //Manual unrolling + for (int k = 0; k < (N & 0xfffffffe); k+=2) { + temp += (OUT_TYPE)(A[i*N+k] * B[k*P+j]); + temp += (OUT_TYPE)(A[i*N+k+1] * B[k*P+j+P]); + + } + C[i*P+j] = (OUT_TYPE)(temp); + } + } + // Leftover on N + + if (N & 0x00000001) + { + for (int i=start; i 1 + synch_barrier(); + #endif +} +#endif diff --git a/fpu_tests/matmul/FP16ALT/support_func.c b/fpu_tests/matmul/FP16ALT/support_func.c new file mode 100644 index 0000000..6d24d2e --- /dev/null +++ b/fpu_tests/matmul/FP16ALT/support_func.c @@ -0,0 +1,15 @@ +#include "config.h" +#include "pulp.h" +double __extendohfdf2(float16alt value) +{ + float result; + __asm__ __volatile__ ("fcvt.s.ah %0, %1": "=f"(result): "f"(value) :); + return (double) result; +} + +double __extendhfdf2(float16 value) +{ + float result; + __asm__ __volatile__ ("fcvt.s.h %0, %1": "=f"(result): "f"(value) :); + return (double) result; +} diff --git a/fpu_tests/matmul/FP32/Makefile b/fpu_tests/matmul/FP32/Makefile new file mode 100644 index 0000000..f9116c0 --- /dev/null +++ b/fpu_tests/matmul/FP32/Makefile @@ -0,0 +1,82 @@ +PULP_APP = test + +PULP_APP_FC_SRCS = main.c +PULP_APP_SRCS = support_func.c matmul.c + +PULP_CFLAGS += -O3 -g3 +PULP_CFLAGS += -mno-memcpy + +ifdef cores +PULP_CFLAGS += -DNUM_CORES=${cores} #-flto -DFABRIC=1 +else +PULP_CFLAGS += -DNUM_CORES=1 +endif + + +ifdef FABRIC +PULP_CFLAGS += -DFABRIC +endif + +ifdef cores +PULP_CFLAGS += -DUSE_INTRINSICS +endif + +ifdef thr +PULP_CFLAGS += -DTHR=${thr} +endif + +PULP_CFLAGS += -fno-tree-vectorize + + +ifdef fmt +PULP_CFLAGS += -D${fmt} -DFIXED + +else +# FP FORMAT +#INPUT DATA TYPE +ifdef fmt_A +PULP_CFLAGS += -DMA${fmt_A} +else +PULP_CFLAGS += -DMAFP32 +endif + +#FILTER DATA TYPE +ifdef fmt_B +PULP_CFLAGS += -DMB${fmt_B} +else +PULP_CFLAGS += -DMBFP32 +endif + +# OUTPUT DATA TYPE + 
+ifdef fmt_OUT +PULP_CFLAGS += -DOUT${fmt_OUT} +else +PULP_CFLAGS += -DOUTFP32 +endif +endif + +# VECTORIAL FORMAT for half-precision FP +ifdef vec +PULP_CFLAGS += -DVECTORIAL +endif + +# CHECK RESULTS +ifdef check +PULP_CFLAGS += -DCHECK +endif + +ifdef PRINT_RESULTS +PULP_CFLAGS += -DPRINT_RESULTS +endif + +ifdef verbose +PULP_CFLAGS += -DVERBOSE +endif + +# STATISTICS +ifdef stats +PULP_CFLAGS += -DSTATS +endif + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/fpu_tests/matmul/FP32/config.h b/fpu_tests/matmul/FP32/config.h new file mode 100644 index 0000000..9505082 --- /dev/null +++ b/fpu_tests/matmul/FP32/config.h @@ -0,0 +1,96 @@ +#ifndef _CONFIG_MATMUL_ +#define _CONFIG_MATMUL_ + +#ifdef FABRIC +#define DATA_LOCATION +#else +#define DATA_LOCATION L1_DATA +#endif + +//Define INPUT data types + +#ifdef FIXED + #ifdef FP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MA_TYPE; + typedef float16 MB_TYPE; + typedef float16 OUT_TYPE; + typedef float16 MA_VTYPE __attribute__((vector_size (4))); + typedef float16 MB_VTYPE __attribute__((vector_size (4))); + typedef float16 OUT_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif defined(FP16ALT) + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MA_TYPE; + typedef float16alt MB_TYPE; + typedef float16alt OUT_TYPE; + typedef float16alt MA_VTYPE __attribute__((vector_size (4))); + typedef float16alt MB_VTYPE __attribute__((vector_size (4))); + typedef float16alt OUT_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif defined(FP32) + typedef float MA_TYPE; + typedef float MB_TYPE; + typedef float OUT_TYPE; + #endif + +#else // MIXED + #ifdef MAFP32 + typedef float MA_TYPE; + #elif MAFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MA_TYPE; + typedef float16 MA_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif MAFP16ALT + typedef signed short v2s 
__attribute__((vector_size (4))); + typedef float16alt MA_TYPE; + typedef float16alt MA_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #endif + + #ifdef MBFP32 + typedef float MB_TYPE; + #elif MBFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 MB_TYPE; + typedef float16 MB_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #elif MBFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt MB_TYPE; + typedef float16alt MB_VTYPE __attribute__((vector_size (4))); + #undef USE_INTRINSICS + #endif + // Define output data types + #ifdef OUTFP32 + typedef float OUT_TYPE; + #elif OUTFP16 + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16 OUT_TYPE; + typedef float16 OUT_VTYPE __attribute__((vector_size (4))); + #elif OUTFP16ALT + typedef signed short v2s __attribute__((vector_size (4))); + typedef float16alt OUT_TYPE; + typedef float16alt OUT_VTYPE __attribute__((vector_size (4))); + #endif + +#endif + +#ifndef THR + #define THR 0.004f +#endif +#ifdef VECTORIAL + #if defined(MAFP16) && defined (MBFP16ALT) || defined (MAFP16ALT) && defined (MBFP16) + #error "Vecotrization does not work for different data types...!!!" + #endif + + #if defined (MAFP32) || defined (MBFP32) || defined (OUTFP32) + + #error "Vecotrization does not work for FP32 data type...!!!" 
+ #endif +#endif + +void matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P); + +#endif diff --git a/fpu_tests/matmul/FP32/data.h b/fpu_tests/matmul/FP32/data.h new file mode 100644 index 0000000..18b3c69 --- /dev/null +++ b/fpu_tests/matmul/FP32/data.h @@ -0,0 +1,7 @@ +#define M 8 +#define N 8 +#define P 8 + +PI_L2 MA_TYPE A_mat[] = {-1.321346640586853, -0.14075148105621338, -1.3874461650848389, 0.28156477212905884, -0.9179959893226624, -1.2968281507492065, 0.19895713031291962, -0.1636660248041153, 0.09745144098997116, -0.4712808132171631, -1.2986916303634644, -1.3279190063476562, -0.6729908585548401, 0.21538715064525604, 0.0033319147769361734, -0.3368145227432251, -0.4880366027355194, -1.6056562662124634, -0.06749758124351501, 0.537982702255249, -1.518497109413147, -0.14922605454921722, 1.2041049003601074, 0.08387433737516403, 1.0984026193618774, -0.27249911427497864, -0.6649801731109619, -0.6570414304733276, -0.3877118229866028, -0.819801926612854, 0.21040719747543335, 2.10652232170105, -0.1445361226797104, 0.8136182427406311, -0.6583006978034973, -0.3472365438938141, -1.0367175340652466, 1.103456974029541, -0.19918836653232574, 0.05155167728662491, 1.4965381622314453, 0.9884462356567383, 1.1312751770019531, -0.5285311341285706, -0.37252330780029297, -0.5133379697799683, 0.3439998924732208, -0.6833076477050781, -1.3308806419372559, -0.0043663098476827145, 1.0170615911483765, -0.2980661988258362, -0.6847434043884277, 0.132366344332695, 1.3589580059051514, -0.03737796097993851, -0.5342410802841187, -0.7537646293640137, -1.2974865436553955, -0.5541737079620361, -0.4874458611011505, 1.8318110704421997, 0.20764854550361633, 1.5493804216384888, }; +PI_L2 MB_TYPE B_mat[] = {-0.8827683925628662, -0.5542697310447693, 0.14389587938785553, -1.975010633468628, 0.20784620940685272, 0.013685223646461964, -0.9441406726837158, 0.2295272946357727, 0.18610191345214844, -0.022595224902033806, -2.596245288848877, 
0.2539166808128357, 0.181132510304451, 0.5449540615081787, -0.025274118408560753, -0.07331778109073639, -0.7438264489173889, -0.23840615153312683, 1.428622841835022, -0.34748250246047974, -1.5984361171722412, 0.20564231276512146, -1.1283481121063232, -0.9706998467445374, 1.064945936203003, 0.5016964077949524, 1.2105423212051392, -0.5837512612342834, -0.983349084854126, 1.118739128112793, -0.43573465943336487, -0.2066168338060379, 2.088926315307617, 0.21492750942707062, 0.019352668896317482, -1.092134952545166, 2.3897173404693604, 0.4519634544849396, -1.5889497995376587, -1.6419037580490112, 0.4415695369243622, -2.7164535522460938, 0.9438610076904297, -0.4197200834751129, -0.4402216672897339, -1.895782709121704, 1.4696519374847412, -0.043221864849328995, -0.6337109208106995, 0.13316427171230316, 1.1930891275405884, -1.0255396366119385, 0.17155316472053528, 0.0703289583325386, -0.42407506704330444, 0.22795960307121277, 0.6689052581787109, 1.656773567199707, -0.28926777839660645, 0.4480336606502533, -0.4898584485054016, 0.8577049374580383, -0.40668636560440063, -1.1225101947784424, }; +PI_L2 OUT_TYPE ref[] = {-0.25370487570762634, 4.288405895233154, -2.4230763912200928, 4.161197662353516, 0.1321820169687271, 1.8521130084991455, 2.2288806438446045, 2.7880353927612305, -2.160022497177124, -1.68727707862854, -1.93358314037323, 1.4045865535736084, 1.7790610790252686, -3.0093014240264893, 3.4853577613830566, 3.0664453506469727, -3.1897361278533936, 0.9710832238197327, 5.895380020141602, 0.7893388867378235, -4.211012363433838, -0.540488064289093, 1.99186372756958, 2.640082359313965, -1.1216058731079102, 4.887918949127197, -2.019449234008789, -0.1283983588218689, 0.3265409469604492, 2.1952335834503174, -1.5282257795333862, -0.5912652015686035, -1.118781328201294, -3.1169605255126953, -2.7250733375549316, 1.8199642896652222, -1.511614203453064, -2.6127007007598877, 4.342489242553711, 2.1691524982452393, -4.221386909484863, -1.1585693359375, -1.2581945657730103, 
-2.8258895874023438, -1.068955421447754, 0.44341498613357544, -2.514594793319702, 0.7613731622695923, -2.158013105392456, -0.041946373879909515, 2.655857563018799, 1.7298434972763062, -3.0531845092773438, -0.6418022513389587, 0.9603833556175232, 0.23947691917419434, 1.4017069339752197, -2.1417248249053955, 0.8747122287750244, 1.8828234672546387, -0.3232908248901367, -3.6543807983398438, 4.977419853210449, 0.33591794967651367, }; diff --git a/fpu_tests/matmul/FP32/main.c b/fpu_tests/matmul/FP32/main.c new file mode 100644 index 0000000..7d12782 --- /dev/null +++ b/fpu_tests/matmul/FP32/main.c @@ -0,0 +1,122 @@ +// License, Version 0.51 (the "License"); you may not use this file except in +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "config.h" +#include "pulp.h" + +#include +#include +#include /* for CHAR_BIT */ +#include + +#include "data.h" + +#define STACK_SIZE 2048 + +void main_fn(testresult_t *result, void (*start)(), void (*stop)()); + +testcase_t testcases[] = { + { .name = "Matrix Multiplication", .test = main_fn }, + {0, 0} +}; + +DATA_LOCATION MA_TYPE matA[M*N] __attribute__ ((aligned (4))); +DATA_LOCATION MB_TYPE matB[N*P] __attribute__ ((aligned (4))); +DATA_LOCATION OUT_TYPE matC[M*P] __attribute__ ((aligned (4))); + +// End of computation +int done = 0; + +int retval = 0; + +void __attribute__ ((noinline)) matrix_init(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C) { + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++){ + A[i*N+j] = A_mat[i*N+j]; + + + } + + for (int i = 0; i < N; i++) + for (int j = 0; j < P; j++){ + B[i*P+j] = B_mat[i*P+j]; + } + for (int i = 0; i < M; i++) + for (int j = 0; j < P; j++) + C[i*P+j] = 0; + +} + +int __attribute ((noinline)) check_result(OUT_TYPE * __restrict__ result) { + #ifndef FABRIC + synch_barrier(); + #endif + + if(get_core_id() == 0) { + float diff; + int err = 0; + + for (int i = 0; i < (M*P); i++) { + diff = fabs(result[i] - ref[i]); + if(diff > THR) { + err++; + #ifdef VERBOSE + + printf("Error at index %d:\t refrence %f\t output %f\t error %.4f\n", i, ref[i], result[i], diff); + #endif + + } + + #ifdef PRINT_RESULTS + + printf("index %d:\t refrence %f\t output %f\t error %f\n", i, ref[i], result[i], diff); + #endif + } + + return err; + + } +} + +void main_fn(testresult_t *result, void (*start)(), void (*stop)()){ + + if (get_core_id() == 0) + matrix_init(matA, matB, matC); + + #ifndef FABRIC + synch_barrier(); + #endif + + #ifdef STATS + start(); + #endif + matMul(matA, matB, matC, M, N, P); + + #ifdef STATS + stop(); + #endif + + #ifdef CHECK + result->errors = check_result(matC); + #endif +}; + +int main() +{ + #ifdef FABRIC + main_fn(); + #else + if (rt_cluster_id() != 0) + return 
bench_cluster_forward(0); + + int nbErrors = run_suite(testcases); + + synch_barrier(); + #endif + retval = nbErrors; + + return retval; +} diff --git a/fpu_tests/matmul/FP32/matmul.c b/fpu_tests/matmul/FP32/matmul.c new file mode 100644 index 0000000..0a616ca --- /dev/null +++ b/fpu_tests/matmul/FP32/matmul.c @@ -0,0 +1,103 @@ +#include "config.h" +#include "pulp.h" + +#ifdef VECTORIAL + +void __attribute__ ((noinline)) matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P){ + + OUT_VTYPE temp; + MA_VTYPE Av; + MB_VTYPE Bv0; + MB_VTYPE Bv1; + OUT_VTYPE *Cv; + int blockSize = (M+NUM_CORES-1)/NUM_CORES; + int start = get_core_id()*blockSize; + int end = start + blockSize < M? start + blockSize : M; + + for (int i = start; i < end; i++) { + for (int j=0; j < (P & 0xfffffffe); j+=2) { + + temp = (OUT_VTYPE) {0, 0}; + + // Manual unrolling + for (int k=0; k<(N & 0xfffffffe); k+=2){ + Av = *((MA_VTYPE *) &A[i*N+k]); + Bv0 = *((MB_VTYPE *) &B[k*P+j]); + Bv1 = *((MB_VTYPE *) &B[k*P+j+P]); + temp += (OUT_VTYPE)(__builtin_shuffle(Av, (v2s){0,0})) * Bv0; + temp += (OUT_VTYPE)(__builtin_shuffle(Av, (v2s){1,1})) * Bv1; + } + + if (N & 0x00000001) + { + temp[0] += A[i*N+N-1] * B[(N-1)*P+j]; + temp[1] += A[i*N+N-1] * B[(N-1)*P+j+1]; + } + Cv = (OUT_VTYPE *) &C[i*P+j]; + + *Cv = temp; + } + } + /// Leftover in P + if (P & 0x00000001) + { + for (int i = start; i < end; i++) { + + OUT_TYPE temp1 = 0; + + // Manual unrolling + for (int k=0; k<(N & 0xfffffffe); k+=2){ + temp1 += A[i*N+k] * B[k*P+P-1]; + temp1 += A[i*N+k+1] * B[k*P+P-1+P]; + } + if (N & 0x00000001) + { + temp1 += A[i*N+N-1] * B[(N-1)*P+P-1]; + } + C[i*P+(P-1)]=temp1; + } + } + + #if NUM_CORES > 1 + synch_barrier(); + #endif +} +#else + +void __attribute__ ((noinline)) matMul(MA_TYPE * __restrict__ A, MB_TYPE * __restrict__ B, OUT_TYPE * __restrict__ C, int M, int N, int P) { + + int blockSize = (M+NUM_CORES-1)/NUM_CORES; + int start = get_core_id()*blockSize; + int 
end = start + blockSize < M? start + blockSize : M; + + for (int i = start; i < end; i++) { + for (int j = 0; j < P; j++) { + OUT_TYPE temp = 0; + + //Manual unrolling + for (int k = 0; k < (N & 0xfffffffe); k+=2) { + temp += (OUT_TYPE)(A[i*N+k] * B[k*P+j]); + temp += (OUT_TYPE)(A[i*N+k+1] * B[k*P+j+P]); + + } + C[i*P+j] = (OUT_TYPE)(temp); + } + } + // Leftover on N + + if (N & 0x00000001) + { + for (int i=start; i 1 + synch_barrier(); + #endif +} +#endif diff --git a/fpu_tests/matmul/FP32/out.txt b/fpu_tests/matmul/FP32/out.txt new file mode 100644 index 0000000..9950041 --- /dev/null +++ b/fpu_tests/matmul/FP32/out.txt @@ -0,0 +1,15 @@ +GNU Make 3.82 +Built for x86_64-redhat-linux-gnu +Copyright (C) 2010 Free Software Foundation, Inc. +License GPLv3+: GNU GPL version 3 or later +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. +Lettura dei makefile... +/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/pulp-runtime/rules/pulpos/targets/pulp_cluster.mk:74: warning: overriding recipe for target `run' +/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/pulp-runtime/rules/pulpos/default_rules.mk:322: warning: ignoring old recipe for target `run' +Aggiornamento degli obbiettivi.... + Il file «all» non esiste. + Il file «/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/regression-tests/fpu_tests/matmul/FP32/build/fp_matmul/fp_matmul» non esiste. + Il file «/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/regression-tests/fpu_tests/matmul/FP32/build/fp_matmul/support_func.o» non esiste. + L'obiettivo «/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/regression-tests/fpu_tests/matmul/FP32/build/fp_matmul/support_func.o» deve essere rigenerato. 
+make: *** Nessuna regola per generare l'obiettivo «/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/regression-tests/fpu_tests/matmul/FP32/build/fp_matmul/support_func.o», necessario per «/scratch2/rtedeschi/pulp_cluster/alsaqr_cluster/regression-tests/fpu_tests/matmul/FP32/build/fp_matmul/fp_matmul». Stop. diff --git a/fpu_tests/matmul/FP32/support_func.c b/fpu_tests/matmul/FP32/support_func.c new file mode 100644 index 0000000..6d24d2e --- /dev/null +++ b/fpu_tests/matmul/FP32/support_func.c @@ -0,0 +1,15 @@ +#include "config.h" +#include "pulp.h" +double __extendohfdf2(float16alt value) +{ + float result; + __asm__ __volatile__ ("fcvt.s.ah %0, %1": "=f"(result): "f"(value) :); + return (double) result; +} + +double __extendhfdf2(float16 value) +{ + float result; + __asm__ __volatile__ ("fcvt.s.h %0, %1": "=f"(result): "f"(value) :); + return (double) result; +} diff --git a/fpu_tests/matmul/README.md b/fpu_tests/matmul/README.md new file mode 100644 index 0000000..538deba --- /dev/null +++ b/fpu_tests/matmul/README.md @@ -0,0 +1,38 @@ +# MatMul test +This test performs a matrix multiplication on FP32/FP16/FP16ALT data and can also be used to measure performances. +In this folder you can find pre-generated golden models. + +## Running a test +After the platform and the SDK setup you can run the test: + +~~~~~shell +make clean all [platform=rtl] run +~~~~~ + +If you want to run this test on RTL, remember to specify the platform which is gvsoc by default. +There are several flags useful to activate some functionalities: + +- `cores=N_CORES` set the number of cores used for the execution to `N_CORES`, by default `cores=1`. There is also the ability to run on the Fabric controller by using `FABRIC=1` instead of `cores=N_CORE`. +- `fmt=FP_FMT` specifies the floating-point format for data, by deafult it is set to `FP32` but you can also choose `FP16` or `FP16ALT` formats. 
**For this application you can use mixed-precision in the C code by using `fmt_A=FP_A fmt_B=FP_B fmt_OUT=FP_OUT` instead of `fmt`.** +- `vec=1` activates vectorial format **only for half-precision floating point (FP16 and FP16ALT)** +- `check=1` activates the result check +- `verbose=1` prints the wrong results +- `stats=1` activates performance measurement +- `PRINT_RESULTS=1` prints the outputs of the C code + + +## Generating the golden model +If you want to re-generate a golden model, you can use the [data_generator.py](./data_generator.py) script with the following command: + +~~~~~shell +./data_generator.py --M=m --N=n --P=p --float_type=fmt --MAC_flag=MAC_FLAG +~~~~~ +- `float_type` specifies the floating-point format for data; by default it is set to `FP32`, but you can also choose the `FP16` or `FP16ALT` formats. **Also, you can run the mixed-precision golden model by using `--float_type=FP_A,FP_B,FP_OUT`.** +- `MAC_flag` is used to emulate the multiply-and-add operator available on most DSP instruction sets for embedded devices. It can be true or false. To emulate `FP16` and `FP16ALT` behavior on PULP, set this flag to true. + +The script will generate three floating-point matrices of format `fmt` (FP32/FP16/FP16ALT): +- A_mat[m,n] input matrix +- B_mat[n,p] input matrix +- ref[m,p] output matrix + +The generated header file will be written in the [references](./references) folder. 
diff --git a/fpu_tests/matmul/data_generator.py b/fpu_tests/matmul/data_generator.py new file mode 100755 index 0000000..26905a2 --- /dev/null +++ b/fpu_tests/matmul/data_generator.py @@ -0,0 +1,206 @@ +#!/bin/python3 + +import os +import argparse +import sys + +import torch +from torch import nn + + +def relative_absolute_error(true, pred): + true_mean = torch.mean(true) + squared_error_num = torch.sum(torch.abs(true - pred)) + squared_error_den = torch.sum(torch.abs(true - true_mean)) + rae_loss = squared_error_num / squared_error_den + return rae_loss + + +def mean_squared_error(true, pred): + squared_error = torch.square(true - pred) + sum_squared_error = torch.sum(squared_error) + size = true.size(dim=0) * true.size(dim=1) + mse_loss = sum_squared_error / size + return mse_loss + + +def matrix_init(IN, dt): + temp = torch.zeros((IN.shape[0], IN.shape[1]), dtype=dt) + # iterate through rows of IN + for i in range(IN.shape[0]): + # iterate through columns of IN + for j in range(IN.shape[1]): + temp[i][j] = IN[i][j] + return temp + + +def error_metric(ref, res): + + # calculate manually because metrics doesn't supprt bfloat16 + d = ref - res + mse_f = torch.mean(d**2) + mae_f = torch.mean(abs(d)) + rmse_f = torch.sqrt(mse_f) + r2_f = 1-(torch.sum(d**2)/torch.sum((ref-torch.mean(ref))**2)) + print("Results of metrics:") + print("MAE:",mae_f.item()) + print("MSE:", mse_f.item()) + print("RMSE:", rmse_f.item()) + print("R-Squared:", r2_f.item()) + rae = relative_absolute_error(ref, res) + print("RAE is", rae.item()) + + +def matrix_mult(Xs, Ys, dt, mac_flag, cast_flag, cast_to): + Rs = torch.zeros((Xs.shape[0], Ys.shape[1]), dtype=dt) + # iterate through rows of X + for i in range(Xs.shape[0]): + # iterate through columns of Y + for j in range(Ys.shape[1]): + temp = torch.tensor([0], dtype=dt) + # iterate through rows of Y + for k in range(Ys.shape[0]): + a = Xs[i][k] + b = Ys[k][j] + if cast_flag == "true": + if cast_to == "FP16": + a = a.type(torch.float16) + b = 
b.type(torch.float16) + elif cast_to == "FP16ALT": + a = a.type(torch.bfloat16) + b = b.type(torch.bfloat16) + if mac_flag == "true": + a = a.type(torch.float32) + b = b.type(torch.float32) + temp = temp.type(torch.float32) + temp += a * b + if mac_flag == "true": + temp = temp.type(dt) + + Rs[i][j] = temp + return Rs + + +def write_matrix(matrix_to_write, name, file_pointer, float_type): + matrix_string = '' + sz0 = matrix_to_write.size()[0] + sz1 = matrix_to_write.size()[1] + if 'ref' in name: + file_pointer.write("PI_L2 OUT_TYPE %s[] = {" % name) + elif 'A_mat' in name: + file_pointer.write("PI_L2 MA_TYPE %s[] = {" % name) + else: + file_pointer.write("PI_L2 MB_TYPE %s[] = {" % name) + if float_type == torch.float32: + name = ")" + elif float_type == torch.float16: + name = ", dtype=torch.float16)" + elif float_type == torch.bfloat16: + name = ", dtype=torch.bfloat16)" + for i in range(sz0): + for j in range(sz1): + matrix_string += str(matrix_to_write[i][j].item()).replace('tensor(', '').replace(name, '') + matrix_string += ', ' + file_pointer.write("%s" % matrix_string) + file_pointer.write("};\n") + + +def get_inital_config(): + # get arguments and data format + parser = argparse.ArgumentParser() + parser.add_argument('--M') + parser.add_argument('--N') + parser.add_argument('--P') + + parser.add_argument('--MAC_flag', default="true") + parser.add_argument('--float_type', default='FP32') + args = parser.parse_args() + + M = int(args.M) + N = int(args.N) + P = int(args.P) + mac_flag = str(args.MAC_flag) + bits = args.float_type.split(",") + return M, N, P, bits, mac_flag + + +def select_dtypes(user_dtypes, num_param): + types_dict = { + "FP32": torch.float32, + "FP16": torch.float16, + "FP16ALT": torch.bfloat16 + } + dtypes = [] + if len(user_dtypes) == 1: + for i in range(num_param): + dtypes.append(types_dict[user_dtypes[0]]) + elif len(user_dtypes) == num_param: + for i in range(num_param): + dtypes.append(types_dict[user_dtypes[i]]) + else: + for i in 
range(len(user_dtypes)): + dtypes.append(types_dict[user_dtypes[i]]) + if 'FP32' in user_dtypes: + for i in range(len(user_dtypes), num_param): + dtypes.append(types_dict["FP32"]) + elif 'FP16' in user_dtypes: + for i in range(len(user_dtypes), num_param): + dtypes.append(types_dict["FP16"]) + else: + for i in range(len(user_dtypes), num_param): + dtypes.append(types_dict["FP16ALT"]) + return dtypes + +def check_cast(datatypes): + result = len(set(datatypes)) == 1 + if result : #All Elements in List are Equal + return "false" + else: #All Elements in List are Not Equal + if torch.float32 in datatypes: + return "false" + else: + return "true" + +def save_data_into_hfile(M, N, P, A_mat, B_mat, res): + # Generate header file + f = open('data.h', 'w') + f.write('\ +#define M %s\n\ +#define N %s\n\ +#define P %s\n\n' % (M, N, P)) + write_matrix(A_mat, 'A_mat', f, A_mat.dtype) + write_matrix(B_mat, 'B_mat', f, B_mat.dtype) + write_matrix(res, 'ref', f, res.dtype) + + f.close() + + +def main(): + M, N, P, bits, mac_flag = get_inital_config() + + # Create reference matrices + A_ref = torch.randn((M, N), dtype=torch.float32) + B_ref = torch.randn((N, P), dtype=torch.float32) + + # calculate reference output + ref = matrix_mult(Xs=A_ref, Ys=B_ref, dt=torch.float32, mac_flag=mac_flag, cast_flag="false",cast_to="false") + + # set the data types based on the parser input + datatypes = select_dtypes(bits, 3) + + cast_flag = check_cast(datatypes[0:2]) + cast_to = "FP16ALT" + A_mat = matrix_init(A_ref, dt=datatypes[0]) + B_mat = matrix_init(B_ref, dt=datatypes[1]) + + res = matrix_mult(Xs=A_mat, Ys=B_mat, dt=datatypes[2], mac_flag=mac_flag, cast_flag=cast_flag, cast_to = cast_to) + + error_metric(ref, res) + save_data_into_hfile(M, N, P, A_mat, B_mat, res) + print("############################## Done! 
###################################") + return None + + +if __name__ == "__main__": + main() + pass diff --git a/hwpe/neureka/Makefile b/hwpe/neureka/Makefile new file mode 100644 index 0000000..22ba1d0 --- /dev/null +++ b/hwpe/neureka/Makefile @@ -0,0 +1,49 @@ +ACCELERATOR ?= neureka + +LIBDIR := $(abspath ./pulp-nnx) +ACC_DIR := $(LIBDIR)/$(ACCELERATOR) + +## Test +INC_DIRS += inc +PULP_APP_SRCS += $(wildcard src/*.c) +SRC_DIRS += src + +## Library +INC_DIRS += $(LIBDIR)/inc $(LIBDIR)/util +PULP_APP_SRCS += $(LIBDIR)/src/pulp_nnx_$(ACCELERATOR).c $(wildcard $(LIBDIR)/util/*.c) +SRC_DIRS += $(LIBDIR)/src $(LIBDIR)/util + +## Accelerator +INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/bsp $(ACC_DIR)/bsp/pulp_cluster +PULP_APP_SRCS += $(wildcard $(ACC_DIR)/hal/*.c) $(wildcard $(ACC_DIR)/bsp/pulp_cluster/*.c) +SRC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/bsp/pulp_cluster + +## Generated +INC_DIRS += gen/inc +SRC_DIRS += gen/src + +INC_FLAGS += $(addprefix -I,$(INC_DIRS)) + +# Flags +ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:]) +PULP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE) -DNNX_NEUREKA_PULP_CLUSTER -DNNX_NEUREKA_PE_H=4 -DNNX_NEUREKA_PE_W=4 +PULP_CFLAGS += $(INC_FLAGS) -O3 + +PULP_APP = test + +ifeq ($(no_ecc),1) + PULP_CFLAGS += -DNO_ECC +endif + +ifeq ($(fault_inject),1) + export FAULT_INJECTION=1 + export FAULT_INJECTION_SCRIPT=$(CURDIR)/pulp_inject_fault.tcl +endif + +ifeq ($(multi_bit_upset),1) + export MULTI_BIT_UPSET=1 +else + export MULTI_BIT_UPSET=0 +endif + +include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk diff --git a/hwpe/neureka/gen/inc/bias.h b/hwpe/neureka/gen/inc/bias.h new file mode 100644 index 0000000..30d072b --- /dev/null +++ b/hwpe/neureka/gen/inc/bias.h @@ -0,0 +1,14 @@ +#ifndef __BIAS_H__ +#define __BIAS_H__ + +#include + +#define BIAS_SIZE (32) +PI_L1 int32_t bias[BIAS_SIZE] = { + -0x65c83656, 0x79b5657f, -0x57f742d4, 0x48370263, -0x322008d7, -0x40c1a507, -0x6cb54499, -0x76a3f065, 
-0x13494905, -0x17d3f6d0, + -0x64732a81, 0x60b4468a, 0x1176b544, -0x1168daef, -0x40e1a4fd, -0x3746219b, 0x18afd65e, 0x6f6ff1d7, -0x61e9d6e3, 0x45b33266, + 0x79d3867b, 0x1c699c9e, 0x6ebc54cb, -0x7f508e3e, -0x1bb33223, 0x6e771d3c, -0x42009579, 0x2c809ab3, -0x3fe5e70c, -0x468955b7, + -0x4405229, -0x79a9c840 +}; + +#endif // __BIAS_H__ diff --git a/hwpe/neureka/gen/inc/input.h b/hwpe/neureka/gen/inc/input.h new file mode 100644 index 0000000..2c7c3e5 --- /dev/null +++ b/hwpe/neureka/gen/inc/input.h @@ -0,0 +1,65 @@ +#ifndef __INPUT_H__ +#define __INPUT_H__ + +#include + +#define INPUT_SIZE (512) + +PI_L1 uint8_t input[INPUT_SIZE] = { + 0xc2, 0x9c, 0xad, 0x2f, 0x37, 0xd4, 0x88, 0x59, 0x67, 0xd8, + 0x73, 0x01, 0x9c, 0x5f, 0xc4, 0x64, 0xd9, 0x74, 0xa9, 0xe7, + 0x18, 0x9a, 0x6e, 0xd7, 0x77, 0xc2, 0x71, 0xa6, 0xdc, 0x3f, + 0xab, 0x2b, 0x36, 0x6c, 0x87, 0x3d, 0x9b, 0x68, 0x64, 0x62, + 0xc4, 0x13, 0x65, 0x5c, 0x3c, 0xcf, 0xd0, 0x0a, 0x12, 0xc1, + 0x18, 0xb4, 0x82, 0x15, 0x77, 0x44, 0x92, 0xb6, 0x6d, 0x09, + 0xc7, 0x0a, 0x41, 0xc1, 0xe9, 0xfe, 0x82, 0x73, 0x1c, 0x1d, + 0xd4, 0xb8, 0x49, 0x81, 0x3e, 0x3a, 0xa7, 0x32, 0x41, 0x19, + 0xcb, 0x37, 0xc8, 0x56, 0x72, 0x34, 0x91, 0x11, 0xa4, 0x99, + 0x9d, 0xc5, 0x7a, 0x50, 0x22, 0x65, 0x87, 0xc6, 0xe1, 0x95, + 0xe8, 0x3a, 0x61, 0xdb, 0x01, 0xb3, 0x56, 0x46, 0xa9, 0xdd, + 0x21, 0xb2, 0xb9, 0xbf, 0x93, 0x3f, 0x33, 0x87, 0x86, 0x65, + 0x20, 0xef, 0xdf, 0xd3, 0xdc, 0xb6, 0xe8, 0x5e, 0xdf, 0xdd, + 0x43, 0xad, 0xbc, 0x81, 0x6e, 0x8b, 0x61, 0x28, 0xef, 0xc6, + 0x5c, 0x7f, 0x01, 0x5a, 0x66, 0x52, 0x5a, 0x14, 0x0f, 0x32, + 0x12, 0x07, 0xb9, 0x98, 0xc8, 0x81, 0x2b, 0x67, 0x07, 0x7f, + 0xcc, 0x2c, 0x68, 0x1f, 0xd9, 0xda, 0xc9, 0x5a, 0x49, 0x8b, + 0x24, 0x03, 0x16, 0xa8, 0x5b, 0xc8, 0x4b, 0x77, 0xfd, 0x50, + 0x0a, 0x3d, 0x7d, 0x1e, 0x99, 0xe4, 0x5b, 0x3b, 0xb2, 0x89, + 0xf5, 0x12, 0x43, 0xec, 0x65, 0x1a, 0x4d, 0x13, 0x3d, 0x61, + 0x95, 0x48, 0x24, 0xf9, 0x0c, 0xed, 0xba, 0xad, 0x0d, 0x93, + 0xed, 0x35, 0x51, 0x30, 0x48, 0x11, 0x04, 0x5b, 0xcd, 
0xa7, + 0xc1, 0x1d, 0x5c, 0xd3, 0xa3, 0xd1, 0x67, 0xb9, 0x14, 0xdf, + 0x96, 0x54, 0x5b, 0x07, 0x27, 0x39, 0xef, 0x81, 0x75, 0x0b, + 0x2a, 0x5b, 0x78, 0x3d, 0x19, 0x5e, 0x20, 0x69, 0x38, 0xc8, + 0x68, 0x4a, 0x2d, 0x39, 0xb5, 0x93, 0x8e, 0xb3, 0x8f, 0x17, + 0xc3, 0xb8, 0x59, 0x2d, 0x9b, 0x2a, 0x4c, 0x93, 0x13, 0xe9, + 0x25, 0x52, 0xf4, 0x6d, 0x9e, 0x84, 0x9c, 0x47, 0x29, 0xdc, + 0xa5, 0x0d, 0x27, 0xf1, 0x74, 0x51, 0x0b, 0x1e, 0xf3, 0xce, + 0x7b, 0xc4, 0x19, 0x71, 0x2d, 0x58, 0x3b, 0x1f, 0x46, 0xd7, + 0xe9, 0x4e, 0xe8, 0x2a, 0x7f, 0x30, 0x06, 0xbb, 0x0a, 0x33, + 0x5e, 0xce, 0x69, 0x46, 0x08, 0x44, 0xca, 0xb2, 0x55, 0xab, + 0x65, 0x6d, 0xe2, 0xce, 0xe4, 0xbe, 0x99, 0x76, 0x3c, 0xd1, + 0x96, 0x71, 0xa5, 0x24, 0x48, 0x69, 0xee, 0xe9, 0x10, 0xf5, + 0x69, 0xbe, 0x66, 0x5e, 0x87, 0x23, 0x2f, 0xe1, 0xdc, 0x05, + 0x28, 0x2b, 0x92, 0x3b, 0x04, 0x2f, 0x56, 0xb2, 0x38, 0x70, + 0x5e, 0x6d, 0xca, 0x08, 0x5b, 0x23, 0x30, 0xf0, 0x34, 0xd1, + 0xc7, 0x7a, 0x56, 0xb8, 0x5a, 0x57, 0x01, 0xe4, 0xc8, 0x80, + 0x7a, 0x2e, 0xed, 0xee, 0x51, 0xf9, 0x10, 0x8b, 0x54, 0x61, + 0x20, 0x3a, 0x90, 0x0b, 0xaa, 0x65, 0x81, 0xc6, 0x99, 0x40, + 0x23, 0xc2, 0x24, 0xf2, 0x61, 0x8c, 0x42, 0x06, 0xb4, 0xa3, + 0x0f, 0xed, 0x6b, 0xd5, 0x2d, 0xb5, 0x09, 0x96, 0xe3, 0x5b, + 0x2e, 0x3a, 0x62, 0xd7, 0x7f, 0x29, 0x03, 0x8a, 0x26, 0x71, + 0x10, 0xae, 0x28, 0x3b, 0x1a, 0xf3, 0x9d, 0xf6, 0x37, 0x23, + 0x84, 0x3c, 0x1f, 0x13, 0xdf, 0x74, 0x63, 0xa2, 0xb2, 0x88, + 0xee, 0xf3, 0x6a, 0xcd, 0x8c, 0xa9, 0x5a, 0xf5, 0xc9, 0xc8, + 0x53, 0xc8, 0xd4, 0x58, 0x44, 0x27, 0x0e, 0x3d, 0xb0, 0x72, + 0xd2, 0x3d, 0x31, 0xf3, 0x76, 0xf2, 0x83, 0x03, 0xfc, 0xd1, + 0x29, 0xc0, 0x19, 0x81, 0xb9, 0x65, 0x8b, 0xf4, 0xe1, 0x8f, + 0xf3, 0x80, 0xc2, 0xe1, 0xb2, 0xd0, 0x27, 0x9a, 0x0d, 0xd7, + 0xf5, 0x46, 0x6d, 0x37, 0xdf, 0x16, 0xe5, 0x40, 0x63, 0xac, + 0xdd, 0xe2 +}; + + + +#endif // __INPUT_H__ diff --git a/hwpe/neureka/gen/inc/layer_conf.h b/hwpe/neureka/gen/inc/layer_conf.h new file mode 100644 index 0000000..42e2de0 --- /dev/null +++ 
b/hwpe/neureka/gen/inc/layer_conf.h @@ -0,0 +1,42 @@ +#ifndef __LAYER_CONF_H__ +#define __LAYER_CONF_H__ + +#define TEST_NAME "test" +#define INPUT_HEIGHT (4) +#define INPUT_WIDTH (4) +#define INPUT_CHANNEL (32) +#define INPUT_SIGNED (0) +#define INPUT_BITS (8) + +#define OUTPUT_HEIGHT (4) +#define OUTPUT_WIDTH (4) +#define OUTPUT_CHANNEL (32) +#define OUTPUT_BITS (8) + +#define WEIGHT_HEIGHT (1) +#define WEIGHT_WIDTH (1) +#define WEIGHT_CHANNEL_IN (32) +#define WEIGHT_CHANNEL_OUT (32) +#define WEIGHT_BITS (8) +#define WEIGHT_OFFSET (-128) + +#define SCALE_BITS (8) + +#define BIAS_BITS (32) + +#define PADDING_TOP (0) +#define PADDING_BOTTOM (0) +#define PADDING_LEFT (0) +#define PADDING_RIGHT (0) +#define PADDING_VALUE (0) + +#define STRIDE_HEIGHT (1) +#define STRIDE_WIDTH (1) + +#define GROUPS (1) +#define OUTSHIFT (23) +#define HAS_NORM_QUANT (1) +#define HAS_BIAS (1) +#define HAS_RELU (1) + +#endif // __LAYER_CONF_H__ diff --git a/hwpe/neureka/gen/inc/output.h b/hwpe/neureka/gen/inc/output.h new file mode 100644 index 0000000..472a3bb --- /dev/null +++ b/hwpe/neureka/gen/inc/output.h @@ -0,0 +1,10 @@ +#ifndef __OUTPUT_H__ +#define __OUTPUT_H__ + +#include + +#define OUTPUT_SIZE (512) +PI_L1 uint8_t output[OUTPUT_SIZE]; + + +#endif // __OUTPUT_H__ diff --git a/hwpe/neureka/gen/inc/scale.h b/hwpe/neureka/gen/inc/scale.h new file mode 100644 index 0000000..8ae79a5 --- /dev/null +++ b/hwpe/neureka/gen/inc/scale.h @@ -0,0 +1,15 @@ +#ifndef __SCALE_H__ +#define __SCALE_H__ + +#include + +#define SCALE_SIZE (32) +PI_L1 uint8_t scale[SCALE_SIZE] = { + 0x2c, 0xca, 0xa6, 0xb9, 0x5f, 0xfb, 0xb4, 0x1e, 0x98, 0x1b, + 0xf1, 0xdf, 0xdd, 0xcf, 0xde, 0xf4, 0x82, 0xfd, 0x38, 0x59, + 0x2b, 0xa6, 0x10, 0x67, 0xa4, 0x2a, 0xf7, 0x12, 0x89, 0xcc, + 0x53, 0x1c +}; + + +#endif // __SCALE_H__ diff --git a/hwpe/neureka/gen/inc/weight.h b/hwpe/neureka/gen/inc/weight.h new file mode 100644 index 0000000..dfa4788 --- /dev/null +++ b/hwpe/neureka/gen/inc/weight.h @@ -0,0 +1,115 @@ +#ifndef 
__WEIGHT_H__ +#define __WEIGHT_H__ + +#include + +#define WEIGHT_SIZE (1024) +PI_L1 uint8_t weight[WEIGHT_SIZE] = { + 0x6e, 0xe7, 0xcc, 0xf1, 0x1d, 0x56, 0xfe, 0xa4, 0xdd, 0x6c, + 0x22, 0xe7, 0x2b, 0x7d, 0x5a, 0xc1, 0x16, 0x58, 0x21, 0xc1, + 0x16, 0x95, 0x10, 0x54, 0x9c, 0xdd, 0xc9, 0x66, 0x89, 0x93, + 0xc7, 0xe1, 0x73, 0x89, 0xad, 0x4d, 0x48, 0x2e, 0xe5, 0x3e, + 0x29, 0x36, 0x3c, 0xae, 0xc3, 0xa4, 0x4a, 0x2b, 0x00, 0x54, + 0x42, 0xed, 0x55, 0x7d, 0x54, 0x94, 0x70, 0x1e, 0x3c, 0x46, + 0x86, 0x01, 0xd3, 0xa4, 0xe2, 0x52, 0x0c, 0x87, 0x0b, 0x5e, + 0x02, 0xfb, 0xcd, 0x76, 0xb6, 0x83, 0xc5, 0x9c, 0xf1, 0x6a, + 0x7f, 0x5f, 0xc0, 0x9c, 0x13, 0xde, 0xdd, 0x70, 0xb6, 0x38, + 0x98, 0x87, 0xe5, 0x71, 0xc7, 0x37, 0x9b, 0x47, 0x54, 0xfa, + 0xeb, 0xc7, 0x72, 0x0f, 0x61, 0x72, 0xff, 0x63, 0xc5, 0x59, + 0xb5, 0xeb, 0x0f, 0x38, 0xff, 0x68, 0x18, 0x4e, 0xdb, 0x0a, + 0x0e, 0xc6, 0x77, 0x17, 0xa5, 0x2e, 0x36, 0x00, 0x8e, 0xe2, + 0x41, 0x4f, 0x1e, 0x13, 0x83, 0x6a, 0x9b, 0x08, 0xc1, 0x63, + 0x4e, 0x14, 0xd8, 0xd0, 0x00, 0x8b, 0x06, 0x30, 0xcc, 0x11, + 0x44, 0x09, 0x1e, 0x9a, 0x07, 0xde, 0x4e, 0x6d, 0x45, 0x36, + 0xdd, 0x98, 0x27, 0x32, 0x60, 0xfa, 0xfb, 0x7b, 0x29, 0x73, + 0xe6, 0x6c, 0xec, 0xa1, 0x3f, 0x36, 0x9b, 0x8c, 0xa4, 0x03, + 0x08, 0x60, 0x2b, 0x06, 0xff, 0x89, 0x4d, 0x26, 0x90, 0x0e, + 0x34, 0xcf, 0xf4, 0x1a, 0xd6, 0xa6, 0x55, 0x06, 0xdf, 0x83, + 0x46, 0x0b, 0xec, 0x7a, 0x27, 0xb4, 0x88, 0x8e, 0xae, 0xa4, + 0xad, 0xcd, 0x3e, 0x2a, 0xc7, 0xba, 0xb6, 0x1b, 0x1f, 0xe3, + 0x26, 0x65, 0xc8, 0x2a, 0x0b, 0xaa, 0xfa, 0xf9, 0x5c, 0xe3, + 0x07, 0x50, 0xc9, 0x51, 0x26, 0x10, 0x83, 0xa6, 0x3e, 0x54, + 0x7b, 0xe3, 0x46, 0x79, 0x9a, 0xbe, 0x25, 0xe9, 0x83, 0x2b, + 0xbb, 0xec, 0x29, 0x7c, 0xe9, 0xc7, 0x28, 0x99, 0x1d, 0x7f, + 0xb3, 0x1b, 0x01, 0xa6, 0xe2, 0xed, 0x8b, 0xce, 0x0e, 0xe1, + 0x8a, 0x93, 0x44, 0x13, 0xae, 0xee, 0x19, 0x7d, 0x87, 0xea, + 0xd7, 0x70, 0x9f, 0x3c, 0xbf, 0x2f, 0x5c, 0x60, 0x10, 0xae, + 0x4b, 0x6b, 0x2d, 0x7e, 0xe5, 0x8a, 0xa9, 0xa0, 0xf9, 0x8e, + 0x4f, 0x98, 0x19, 
0xf9, 0x3e, 0x4b, 0x0d, 0xf0, 0x81, 0xe0, + 0x52, 0xa8, 0xb5, 0x29, 0x40, 0x0d, 0x71, 0x90, 0x39, 0xe8, + 0x7d, 0x15, 0xf7, 0x21, 0x59, 0x34, 0xf3, 0xfb, 0xc2, 0x51, + 0x37, 0xee, 0x78, 0x32, 0x06, 0xdd, 0xfb, 0xf4, 0x39, 0xb7, + 0x96, 0x1f, 0x17, 0x94, 0x36, 0x2e, 0xb7, 0xd2, 0x6b, 0xa3, + 0x5d, 0x35, 0xcb, 0x76, 0xb2, 0xa9, 0x01, 0xaa, 0xd1, 0xd1, + 0x8a, 0xb7, 0x86, 0x17, 0x21, 0x18, 0xba, 0xa2, 0x38, 0xbe, + 0xc9, 0x64, 0x2d, 0xf4, 0x81, 0xc2, 0xd1, 0x97, 0xbb, 0xcd, + 0x04, 0x50, 0x20, 0xbb, 0x67, 0x20, 0xb4, 0x8a, 0x96, 0x30, + 0x8f, 0x35, 0xa4, 0xb0, 0xc2, 0x8c, 0x4e, 0x9c, 0xc7, 0x2d, + 0xbb, 0x90, 0x2d, 0x4a, 0x96, 0xbb, 0xb9, 0x71, 0x35, 0x37, + 0xe7, 0xc8, 0x54, 0x1c, 0x91, 0x15, 0xb7, 0x46, 0x28, 0x3e, + 0x90, 0x85, 0xa6, 0x09, 0xba, 0xd4, 0x5a, 0x51, 0x5d, 0x4a, + 0xb6, 0xa0, 0xb1, 0x84, 0x12, 0x9d, 0x85, 0x6d, 0xa7, 0x07, + 0x7d, 0x3b, 0x24, 0x44, 0x61, 0xb7, 0x78, 0x41, 0xba, 0x29, + 0x8b, 0xc9, 0x67, 0x2e, 0x54, 0x72, 0x79, 0x11, 0x61, 0xe9, + 0x31, 0x84, 0x21, 0x70, 0x1e, 0xcc, 0x5c, 0x03, 0x75, 0x19, + 0xab, 0x3d, 0xea, 0x1d, 0x15, 0xae, 0xba, 0xe5, 0xa2, 0x0b, + 0x73, 0xf4, 0xa3, 0x55, 0x3c, 0x81, 0x3c, 0x00, 0x30, 0xb0, + 0x2e, 0xca, 0x8f, 0xd8, 0xd3, 0x71, 0x2a, 0x9e, 0x40, 0xaf, + 0x0d, 0x20, 0xa1, 0x22, 0xf8, 0x7e, 0xa4, 0x8d, 0x5a, 0x7a, + 0xfc, 0x23, 0x05, 0x4c, 0xa3, 0xc1, 0x16, 0x95, 0x5e, 0xa9, + 0x6e, 0xbf, 0xf8, 0xaf, 0x5f, 0xe3, 0xb0, 0x22, 0x62, 0x39, + 0x45, 0xc9, 0x47, 0x84, 0x0e, 0x53, 0x79, 0xa2, 0xa8, 0x20, + 0x43, 0x65, 0x63, 0x4a, 0x6d, 0x10, 0x7c, 0x03, 0x5f, 0xd0, + 0xbe, 0x6f, 0x5a, 0x91, 0x41, 0xef, 0x9e, 0x06, 0xc9, 0x43, + 0x1b, 0xd9, 0x6f, 0xe5, 0x69, 0xac, 0xb2, 0xe4, 0x35, 0xea, + 0x8a, 0x8b, 0xb1, 0x5c, 0x20, 0x53, 0x38, 0x40, 0x29, 0xc6, + 0x0e, 0x64, 0xa3, 0xf7, 0xbf, 0x75, 0xad, 0x84, 0x6d, 0x9b, + 0xdd, 0x17, 0x10, 0x3e, 0x85, 0x6a, 0xc2, 0xc2, 0xc1, 0x22, + 0xb0, 0xf7, 0xae, 0x65, 0x43, 0x72, 0x4a, 0xbd, 0xeb, 0xe0, + 0x4e, 0xf9, 0x88, 0xa7, 0x00, 0x02, 0x8d, 0xbc, 0x05, 0x2f, + 0x6c, 0x8d, 0xa5, 0xb6, 0x02, 
0x64, 0x32, 0x55, 0x1d, 0xde, + 0x76, 0xc6, 0x54, 0x95, 0xf9, 0xe7, 0xa5, 0x71, 0x66, 0xc0, + 0xe6, 0xab, 0xd7, 0xb7, 0x03, 0xa1, 0x38, 0xc4, 0x6f, 0x90, + 0x0a, 0x87, 0x25, 0x39, 0x42, 0x8f, 0xc7, 0x6b, 0x62, 0x2e, + 0x13, 0xde, 0x07, 0x1e, 0xa7, 0x00, 0xf0, 0x8d, 0xeb, 0xd6, + 0xc2, 0x9e, 0x20, 0x58, 0xe7, 0x7f, 0xc6, 0x20, 0xd6, 0xb1, + 0xfe, 0xf3, 0xee, 0x34, 0x9c, 0x14, 0x1e, 0xf7, 0xa4, 0x18, + 0x2c, 0xdd, 0xfb, 0xde, 0x4a, 0x95, 0xf8, 0xb0, 0x73, 0x14, + 0x09, 0x89, 0x75, 0x4c, 0xe8, 0xc8, 0x00, 0x73, 0xdc, 0x49, + 0x90, 0x58, 0x15, 0x53, 0x22, 0xca, 0x16, 0xc5, 0x49, 0xd5, + 0x2b, 0x98, 0x2f, 0x05, 0x2d, 0x8d, 0x6b, 0x0a, 0x0e, 0xac, + 0xfb, 0x7d, 0x42, 0x12, 0x83, 0xe8, 0x8d, 0x72, 0xa3, 0xf2, + 0xf7, 0x04, 0xc6, 0x87, 0x97, 0xa5, 0x82, 0x5b, 0x85, 0xea, + 0x8f, 0xd3, 0x68, 0x74, 0x3e, 0xf7, 0x36, 0x0c, 0xa0, 0x83, + 0x51, 0x68, 0xf9, 0xfb, 0xbe, 0x10, 0xde, 0xad, 0xec, 0x91, + 0xea, 0x12, 0x2e, 0x4a, 0xf5, 0x35, 0x03, 0x50, 0x3b, 0x5f, + 0x1e, 0xdf, 0xdc, 0x95, 0x68, 0xeb, 0x87, 0x8e, 0x24, 0xa4, + 0x70, 0x13, 0x9e, 0x09, 0x50, 0xa5, 0x15, 0x78, 0x7f, 0x18, + 0x06, 0x61, 0x49, 0x8a, 0xb4, 0x65, 0xca, 0x33, 0xe2, 0xa7, + 0x57, 0x51, 0xa1, 0xcf, 0x58, 0x82, 0xa5, 0xa0, 0xab, 0xd1, + 0x5f, 0xf5, 0x68, 0x2c, 0xad, 0x4e, 0xe5, 0xd1, 0x96, 0x4f, + 0xe0, 0x51, 0x65, 0x0c, 0x44, 0xb8, 0xc6, 0x5b, 0xc8, 0x0e, + 0xe5, 0x2a, 0x7e, 0x74, 0xd7, 0x45, 0x7b, 0xb5, 0x03, 0x4a, + 0x77, 0x7f, 0x8e, 0xda, 0x26, 0x5c, 0xe3, 0x58, 0xbc, 0x36, + 0xa5, 0x6e, 0xa3, 0x7e, 0x87, 0xb9, 0x80, 0xd6, 0x93, 0x5e, + 0xf5, 0x24, 0xd4, 0x2d, 0xbc, 0x4d, 0xc8, 0xe7, 0xdc, 0x85, + 0x44, 0x00, 0x4b, 0x8b, 0x2d, 0x05, 0x32, 0x97, 0x4c, 0x9c, + 0xb9, 0xb4, 0x97, 0x97, 0x55, 0x60, 0x0b, 0x98, 0x21, 0x4f, + 0xa2, 0x2e, 0x4d, 0xcc, 0x5b, 0x8d, 0xde, 0x25, 0xaa, 0x0a, + 0xfc, 0xca, 0xd1, 0x54, 0x90, 0xbe, 0xef, 0xe6, 0x93, 0x49, + 0xbb, 0x33, 0xaf, 0xf9, 0x5b, 0x98, 0xc4, 0xce, 0x3c, 0x22, + 0xd0, 0xed, 0x5a, 0x9e, 0x01, 0xc0, 0x6a, 0x76, 0x2a, 0x74, + 0x8f, 0x65, 0xdd, 0xec, 0x7b, 0xd8, 0xef, 
0x0e, 0xc0, 0x0d, + 0xdb, 0x47, 0x8b, 0x6a, 0x0d, 0xc5, 0xe5, 0x53, 0xb2, 0x22, + 0x84, 0x71, 0xb2, 0xf6, 0x99, 0x42, 0x14, 0x43, 0x94, 0xad, + 0x61, 0xbd, 0x00, 0xef, 0xf7, 0xfd, 0x20, 0xd7, 0x0c, 0x67, + 0xf7, 0xca, 0xdf, 0x78, 0xb0, 0x28, 0x95, 0xb9, 0x0e, 0x4c, + 0x3c, 0xbb, 0x2b, 0x85, 0xa3, 0x68, 0x78, 0x6f, 0x31, 0xe7, + 0x5f, 0x7b, 0xa7, 0x5f, 0xe7, 0x17, 0x35, 0xdd, 0x20, 0x2c, + 0x10, 0x3a, 0x98, 0xe3, 0xb8, 0x7a, 0x8e, 0x6f, 0xee, 0x34, + 0x14, 0x75, 0x19, 0xf8 +}; + + + +#endif // __WEIGHT_H__ diff --git a/hwpe/neureka/inc/ecc_check.h b/hwpe/neureka/inc/ecc_check.h new file mode 100644 index 0000000..3f9783d --- /dev/null +++ b/hwpe/neureka/inc/ecc_check.h @@ -0,0 +1,27 @@ +/* + * Copyright 2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __ECC_CHECK_H__ +#define __ECC_CHECK_H__ + +#include + +#define ECC_REGS (4) +extern uint32_t ecc_errs[ECC_REGS]; + +#endif // __ECC_CHECK_H__ diff --git a/hwpe/neureka/inc/layer_util.h b/hwpe/neureka/inc/layer_util.h new file mode 100644 index 0000000..e44ede9 --- /dev/null +++ b/hwpe/neureka/inc/layer_util.h @@ -0,0 +1,40 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __LAYER_UTIL_H__ +#define __LAYER_UTIL_H__ + +#include "layer_conf.h" +#include + +static void layer_info() { + printf("Layer info:\n" + " - input: (%dx%dx%d)\n" + " - output: (%dx%dx%d)\n" + " - weight: (%dx%dx%dx%d)\n" + " - stride: (%dx%d)\n" + " - padding: (%dx%dx%dx%d)\n", + INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, + OUTPUT_CHANNEL, WEIGHT_CHANNEL_OUT, WEIGHT_HEIGHT, WEIGHT_WIDTH, + WEIGHT_CHANNEL_IN, STRIDE_HEIGHT, STRIDE_WIDTH, PADDING_TOP, + PADDING_BOTTOM, PADDING_LEFT, PADDING_RIGHT); +} + +#endif // __LAYER_UTIL_H__ diff --git a/hwpe/neureka/inc/nnx_layer.h b/hwpe/neureka/inc/nnx_layer.h new file mode 100644 index 0000000..cbaf4b9 --- /dev/null +++ b/hwpe/neureka/inc/nnx_layer.h @@ -0,0 +1,26 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NNX_LAYER_H__ +#define __NNX_LAYER_H__ + +void execute_nnx_layer(void *unused_args); + +#endif // __NNX_LAYER_H__ diff --git a/hwpe/neureka/inc/pmsis.h b/hwpe/neureka/inc/pmsis.h new file mode 100644 index 0000000..dbb6a91 --- /dev/null +++ b/hwpe/neureka/inc/pmsis.h @@ -0,0 +1,11 @@ +// fake pmsis.h +#include +#include +// fake data in L2, actually in L1! +#ifndef PI_L1 + #define PI_L1 __attribute__((section(".data_l1"))) +#endif +#ifndef PI_L2 + #define PI_L2 __attribute__((section(".data_l1"))) +#endif + diff --git a/hwpe/neureka/pulp-nnx b/hwpe/neureka/pulp-nnx new file mode 160000 index 0000000..4becda2 --- /dev/null +++ b/hwpe/neureka/pulp-nnx @@ -0,0 +1 @@ +Subproject commit 4becda239309035887db4f7fdf54a63ac8463180 diff --git a/hwpe/neureka/pulp_inject_fault.tcl b/hwpe/neureka/pulp_inject_fault.tcl new file mode 100644 index 0000000..fa7efb2 --- /dev/null +++ b/hwpe/neureka/pulp_inject_fault.tcl @@ -0,0 +1,53 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Author: Michael Rogenmoser (michaero@iis.ee.ethz.ch) + +transcript quietly +if {! 
[info exists ::env(VSIM_PATH)]} {error "Define VSIM_PATH"} +set utils_base_path [file join $::env(VSIM_PATH) scripts fault_injection_utils] +set script_base_path [file join $::env(VSIM_PATH) fault_injection_sim scripts] + +set verbosity 2 +set log_injections 1 +# Easy way to generate a variable seed +# set seed [clock seconds] +# Default value +set seed 12345 +set print_statistics 1 + +set inject_start_time 80000000000ps +set inject_stop_time 150000000000ps +set injection_clock "pulp_cluster_tb/cluster_i/clk_i" +set injection_clock_trigger 0 +set fault_period 100 +set rand_initial_injection_phase 1 +# max_num set to 0 means until stop_time +set max_num_fault_inject 0 +set signal_fault_duration 20ns +set register_fault_duration 0ns + +set allow_multi_bit_upset $::env(MULTI_BIT_UPSET) +set use_bitwidth_as_weight 0 +set check_core_output_modification 0 +set check_core_next_state_modification 0 +set reg_to_sig_ratio 1 + +source [file join $utils_base_path pulp_extract_nets.tcl] + +set inject_signals_netlist [] +set inject_register_netlist [] +set output_netlist [] +set next_state_netlist [] +set assertion_disable_list [] + +# for {set idx 0} {$idx < 12} {incr idx} { +# set inject_signals_netlist [list {*}$inject_signals_netlist {*}[get_all_core_nets $idx]] +# set output_netlist [list {*}$output_netlist {*}[get_core_output_nets $idx]] +# } + +set inject_register_netlist [list {*}$inject_register_netlist {*}[get_memory_slice {0 16} {0 50}]] + +source [file join $script_base_path inject_fault.tcl] + diff --git a/hwpe/neureka/src/main.c b/hwpe/neureka/src/main.c new file mode 100644 index 0000000..d0133cf --- /dev/null +++ b/hwpe/neureka/src/main.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2020-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Francesco Conti + * Gianna Paulin + * Renzo Andri + * Arpan Suravi Prasad + * Luka Macan + * Main Test Program for N-EUREKA + */ + +#include +#include +#include + +#include "layer_util.h" +#include "nnx_layer.h" +#ifndef NO_ECC +#include "ecc_check.h" +#endif + +#define OUTPUT_SIZE 512 + +extern uint8_t output[]; + +#ifndef NO_ECC +uint32_t ecc_errs[ECC_REGS]; +#endif + +static int check_output() { + uint32_t checksum = 0; + for (int i = 0; i < OUTPUT_SIZE; i++) { + checksum += output[i]; + } + return (checksum != 0x00007330); +} + +int errors = 0; +#ifndef NO_ECC +unsigned int intc_data_correctable_cnt = 0; +unsigned int intc_meta_correctable_cnt = 0; +unsigned int intc_data_uncorrectable_cnt = 0; +unsigned int intc_meta_uncorrectable_cnt = 0; +#endif + +int main() { + + unsigned int core_id = get_core_id(); + unsigned int cluster_id = rt_cluster_id(); + + if (core_id == 0) { + + // execute NNX layer + execute_nnx_layer(NULL); + + errors = check_output(); + + *(int *) 0x1A1040A0 = errors; + if(errors) + printf ("[KO] Terminated test with errors!!!\n"); + else + printf ("[OK] Terminated test with no errors!!!\n"); + + #ifndef NO_ECC + // Check number of detected errors by ECC modules inside interconnect + intc_data_correctable_cnt = hwpe_hci_ecc_get_data_correctable_count(cluster_id); + intc_meta_correctable_cnt = hwpe_hci_ecc_get_meta_correctable_count(cluster_id); + intc_data_uncorrectable_cnt = hwpe_hci_ecc_get_data_uncorrectable_count(cluster_id); + intc_meta_uncorrectable_cnt = 
hwpe_hci_ecc_get_meta_uncorrectable_count(cluster_id); + for (int i = 0; i < 16; i++) { + intc_meta_correctable_cnt += tcdm_scrubber_get_mismatch_count(cluster_id, i); + } + + printf("Data errors corrected inside Neureka: %d. Data errors uncorrectable inside Neureka: %d\n", + ecc_errs[0], ecc_errs[1]); + printf("Meta errors corrected inside Neureka: %d. Meta errors uncorrectable inside Neureka: %d\n", + ecc_errs[2], ecc_errs[3]); + + printf("Data errors corrected inside intc: %d. Data errors uncorrectable inside intc: %d\n", + intc_data_correctable_cnt, intc_data_uncorrectable_cnt); + printf("Meta errors corrected inside intc: %d. Meta errors uncorrectable inside intc: %d\n", + intc_meta_correctable_cnt, intc_meta_uncorrectable_cnt); + #endif + } + synch_barrier(); + #ifndef NO_ECC + return (errors != 0) && (intc_data_uncorrectable_cnt == 0 && intc_meta_uncorrectable_cnt == 0 && (ecc_errs[1]==0 && ecc_errs[3]==0)); + #else + return errors; + #endif +} diff --git a/hwpe/neureka/src/nnx_layer.c b/hwpe/neureka/src/nnx_layer.c new file mode 100644 index 0000000..06a7dd4 --- /dev/null +++ b/hwpe/neureka/src/nnx_layer.c @@ -0,0 +1,176 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "nnx_layer.h" +#include "ecc_check.h" +#include "pulp.h" + +#include "neureka.h" +#include "neureka_pulp_cluster_bsp.h" +#include "neureka_task.h" +#include "pulp_nnx_neureka.h" + +#define NULL 0 + +typedef neureka_norm_mode_e nnx_norm_mode_e; +typedef neureka_quant_t nnx_quant_t; +typedef neureka_quant_function_e nnx_quant_function_e; +typedef neureka_norm_t nnx_norm_t; +typedef neureka_task_t nnx_task_t; +typedef neureka_dev_t nnx_dev_t; +typedef neureka_pulp_cluster_conf_t nnx_bsp_conf_t; +typedef neureka_task_flag_e nnx_task_flag_e; + +#define nnxTaskFlagTrue neurekaTaskFlagTrue +#define nnxTaskFlagFalse neurekaTaskFlagFalse + +#define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset +#define nnx_task_set_weight_source neureka_task_set_weight_source +#define nnx_task_set_activation_prefetch neureka_task_set_activation_prefetch +#define nnx_task_set_dims neureka_task_set_dims +#define nnx_task_set_ptrs_conv neureka_task_set_ptrs_conv +#define nnx_task_set_ptrs_norm_quant neureka_task_set_ptrs_norm_quant + +#define nnx_bsp_get_dev neureka_pulp_cluster_get_dev + +#define nnx_init neureka_nnx_init +#define nnx_dispatch_wait neureka_nnx_dispatch_wait +#define nnx_dispatch neureka_nnx_dispatch +#define nnx_resolve_wait neureka_nnx_resolve_wait +#define nnx_read_ecc_regs neureka_nnx_read_ecc_regs +#define nnx_term neureka_nnx_term + +// Generated headers +#include "layer_conf.h" +#if HAS_BIAS != 0 + #include "bias.h" +#endif +#include "input.h" +#include "output.h" +#if HAS_NORM_QUANT != 0 + #include "scale.h" +#endif +#include "weight.h" + +static void task_prepare(nnx_task_t *task) { + nnx_task_init(task); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1); + 
nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + + nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + +#ifdef NEUREKA_WEIGHT_SOURCE_WMEM + nnx_task_set_weight_source(task, neurekaWeightSourceWmem); + nnx_task_set_activation_prefetch(task, activationPrefetchOn); +#else + neureka_task_set_weight_source(task, neurekaWeightSourceTcdm); + nnx_task_set_activation_prefetch(task, activationPrefetchOff); +#endif +#if INPUT_SIGNED == 1 + neureka_task_set_input_signed(task); +#else + neureka_task_set_input_unsigned(task); +#endif + + const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride; + const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride; + +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride, + WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_LEFT, + PADDING_RIGHT); +#else + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, + w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_LEFT, + PADDING_RIGHT); +#endif + + nnx_task_set_ptrs_conv(task, (uint32_t)input, INPUT_WIDTH, w_in_stride, + PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight); +#if HAS_NORM_QUANT == 1 +#if SCALE_BITS == 8 + const nnx_norm_mode_e normMode = normMode8Bit; +#elif SCALE_BITS == 32 + const nnx_norm_mode_e normMode = normMode32Bit; +#endif + + const nnx_task_flag_e flag_bias = + HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse; + const uint32_t bias_ptr = (uint32_t)(HAS_BIAS ? bias : NULL); + + nnx_quant_function_e quant_function = + HAS_RELU ? 
quantFunctionRelu : quantFunctionIdentity; + + nnx_task_set_norm_quant(task, + (nnx_quant_t){.shift_amount = OUTSHIFT, + .function = quant_function, + .flag_rounding = nnxTaskFlagFalse}, + (nnx_norm_t){.mode = normMode, + .flag_bias = flag_bias, + .flag_shift = nnxTaskFlagFalse}); + + nnx_task_set_ptrs_norm_quant(task, (uint32_t)scale, NULL, bias_ptr); +#endif // HAS_NORM_QUANT +} + +static void task_execute(nnx_task_t *task) { + nnx_dev_t *dev = nnx_bsp_get_dev(); + + nnx_bsp_conf_t conf = {.max_stall = 8}; + nnx_init(dev, &conf); + + nnx_dispatch_wait(dev); + + // printf("CFG:\n"); + // for (int i=0; idata)[i]); + // } +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, + OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, + WEIGHT_WIDTH); +#else + nnx_dispatch(dev, task); +#endif + + nnx_resolve_wait(dev, task); +#ifndef NO_ECC + nnx_read_ecc_regs(dev, (uint32_t)ecc_errs); +#endif + + nnx_term(dev); + +} + +void execute_nnx_layer(void *args) { + nnx_task_t task; + task_prepare(&task); + task_execute(&task); +} diff --git a/hwpe/redmule/Makefile b/hwpe/redmule/Makefile new file mode 100644 index 0000000..2d118a6 --- /dev/null +++ b/hwpe/redmule/Makefile @@ -0,0 +1,23 @@ +PULP_APP = test +PULP_APP_SRCS = redmule.c +PULP_CFLAGS = -O3 + +ifeq ($(use_dma),1) + PULP_CFLAGS += -DUSE_DMA +endif +ifeq ($(no_ecc),1) + PULP_CFLAGS += -DNO_ECC +endif + +ifeq ($(fault_inject),1) + export FAULT_INJECTION=1 + export FAULT_INJECTION_SCRIPT=$(CURDIR)/pulp_inject_fault.tcl +endif + +ifeq ($(multi_bit_upset),1) + export MULTI_BIT_UPSET=1 +else + export MULTI_BIT_UPSET=0 +endif + +include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk diff --git a/hwpe/redmule/archi_redmule.h b/hwpe/redmule/archi_redmule.h new file mode 100644 index 0000000..40eceee --- /dev/null +++ b/hwpe/redmule/archi_redmule.h @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2022-2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Yvan Tortorella + * + * High-level architecture of RedMulE + * + */ + +#ifndef __ARCHI_REDMULE_H__ +#define __ARCHI_REDMULE_H__ + +/* + * |========================================================================| + * || || + * ||Control and generic configuration register layout || + * |========================================================================| + * || # reg | offset | bits | bitmask || content || + * ||-------+----------+---------+--------------++-------------------------|| + * || 0 | 0x0000 | 31: 0 | 0xFFFFFFFF || TRIGGER || + * || 1 | 0x0004 | 31: 0 | 0xFFFFFFFF || ACQUIRE || + * || 2 | 0x0008 | 31: 0 | 0xFFFFFFFF || EVT_ENABLE || + * || 3 | 0x000c | 31: 0 | 0xFFFFFFFF || STATUS || + * || 4 | 0x0010 | 31: 0 | 0xFFFFFFFF || RUNNING_JOB || + * || 5 | 0x0014 | 31: 0 | 0xFFFFFFFF || SOFT_CLEAR || + * |========================================================================| + * || || + * ||Job-dependent registers layout || + * |========================================================================| + * || # reg | offset | bits | bitmask || content || + * ||-------+----------+---------+--------------++-------------------------|| + * || 0 | 0x0040 | 31: 0 | 0xFFFFFFFF || X_ADDR || + * ||-------+----------+---------+--------------++-------------------------|| + * || 1 | 0x0044 | 31: 0 | 0xFFFFFFFF || W_ADDR || + * 
||-------+----------+---------+--------------++-------------------------|| + * || 2 | 0x0048 | 31: 0 | 0xFFFFFFFF || Z_ADDR || + * ||-------+----------+---------+--------------++-------------------------|| + * || 3 | 0x004C | | || Matrix Config 0 Reg || + * || | | 31:16 | 0xFFFF0000 || K Size (W Columns) || + * || | | 15: 0 | 0x0000FFFF || M Size (X Rows) || + * ||-------+----------+---------+--------------++-------------------------|| + * || 4 | 0x0050 | | || Matrix Config 1 Reg || + * || | | 31:16 | 0xFFFFFFFF || N Size (X Cols/W Rows) || + * ||-------+----------+---------+--------------++-------------------------|| + * || 5 | 0x0054 | | || Matrix Arithmetic Reg || + * || | | 12:10 | 0x00001C00 || Operation selection || + * || | | 9: 7 | 0x00000380 || Input/Output format || + * |========================================================================| + * + */ + +/* PULP Cluster Archi defines */ +#define ARCHI_CLUST_CTRL_BASE ARCHI_CLUSTER_CTRL_ADDR +#define ARCHI_CLUST_HWPE_BASE ARCHI_HWCE_ADDR +#define DMA_COMMAND_QUEUE ARCHI_MCHAN_DEMUX_ADDR +#define DMA_STATUS_REGISTER (ARCHI_MCHAN_DEMUX_ADDR + 4) +#define ARCHI_CL_HWPE_EVT0 12 +#define ARCHI_CL_HWPE_EVT1 13 +#define FC_DMA_EVENT 8 +#define CL_DMA_EVENT 22 +#define CLUST_CTRL_HWPE_EN 0x18 +#define CLUST_CTRL_HWPE_EN_MASK 0x800 +#define __builtin_bitinsert(a,b,c,d) (a | (((b << (32-c)) >> (32-c)) << d)) + +// RedMulE architecture +#define ADDR_WIDTH 32 +#define DATA_WIDTH 256 +#define REDMULE_FMT 16 +#define ARRAY_HEIGHT 4 +#define PIPE_REGS 3 +#define ARRAY_WIDTH 12 /* Superior limit is ARRAY_HEIGHT*PIPE_REGS */ + +// Commands +#define REDMULE_TRIGGER 0x00 +#define REDMULE_ACQUIRE 0x04 +#define REDMULE_FINISHED 0x08 +#define REDMULE_STATUS 0x0C +#define REDMULE_RUNNING_JOB 0x10 +#define REDMULE_SOFT_CLEAR 0x14 + +// Registers +#define REDMULE_REG_OFFS 0x40 +// #define REDMULE_REG_X_PTR 0x00 +// #define REDMULE_REG_W_PTR 0x04 +// #define REDMULE_REG_Z_PTR 0x08 +// #define REDMULE_MCFG0_PTR 0x0C +// #define 
REDMULE_MCFG1_PTR 0x10 +// #define REDMULE_ARITH_PTR 0x14 +#define REDMULE_REG_X_PTR 0x00 +#define REDMULE_REG_W_PTR 0x04 +#define REDMULE_REG_Y_PTR 0x08 +#define REDMULE_REG_Z_PTR 0x0C +#define REDMULE_REG_X_ITER_PTR 0x10 +#define REDMULE_REG_W_ITER_PTR 0x14 +#define REDMULE_REG_LEFTOVERS_PTR 0x18 +#define REDMULE_REG_LEFT_PARAMS_PTR 0x1C +#define REDMULE_REG_X_D1_STRIDE_PTR 0x20 +#define REDMULE_REG_W_TOT_LEN_PTR 0x24 +#define REDMULE_REG_TOT_X_READ_PTR 0x28 +#define REDMULE_REG_W_D0_STRIDE_PTR 0x2C +#define REDMULE_REG_YZ_TOT_LEN_PTR 0x30 +#define REDMULE_REG_YZ_D0_STRIDE_PTR 0x34 +#define REDMULE_REG_YZ_D2_STRIDE_PTR 0x38 +#define REDMULE_REG_X_ROWS_OFFS_PTR 0x3C +#define REDMULE_REG_X_BUFFER_SLOTS_PTR 0x40 +#define REDMULE_REG_X_TOT_LEN_PTR 0x44 +#define REDMULE_REG_OP_SELECTION 0x48 + +#define REDMULE_ECC_REG_OFFS 0x90 +#define DATA_CORR_ERR 0x00 +#define DATA_UNCORR_ERR 0x04 +#define METADATA_CORR_ERR 0x08 +#define METADATA_UNCORR_ERR 0x0c + +// OPs definition +#define MATMUL 0x0 +#define GEMM 0x1 +#define ADDMAX 0x2 +#define ADDMIN 0x3 +#define MULMAX 0x4 +#define MULMIN 0x5 +#define MAXMIN 0x6 +#define MINMAX 0x7 + +// GEMM formats +#define Float8 0x0 +#define Float16 0x1 +#define Float8Alt 0x2 +#define Float16Alt 0x3 + +#define RNE 0x0 +#define RTZ 0x1 +#define OP_FMADD 0x0 +#define OP_ADD 0x2 +#define OP_MUL 0x3 +#define OP_MINMAX 0x7 + +// FP Formats encoding +#define FP16 0x2 +#define FP8 0x3 +#define FP16ALT 0x4 +#define FP8ALT 0x5 + +/* DMA Archi */ +#define DMA_TX 0 +#define DMA_RX 1 +#define DMA_INC 1 + +#define PLP_DMA_TYPE_BIT 0x00000011 +#define PLP_DMA_INCR_BIT 0x00000012 +#define PLP_DMA_2D_BIT 0x00000013 +#define PLP_DMA_ELE_BIT 0x00000014 +#define PLP_DMA_ILE_BIT 0x00000015 +#define PLP_DMA_BLE_BIT 0x00000016 +#define PLP_DMA_2D_TCDM_BIT 0x0000017 + +#endif diff --git a/hwpe/redmule/hal_redmule.h b/hwpe/redmule/hal_redmule.h new file mode 100644 index 0000000..8fc5000 --- /dev/null +++ b/hwpe/redmule/hal_redmule.h @@ -0,0 +1,556 @@ +/* + * 
Copyright (C) 2022-2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Yvan Tortorella + * + * RedMulE Hardware Abstraction Layer (HAL) + */ + +#ifndef __HAL_REDMULE_H__ +#define __HAL_REDMULE_H__ + +#include +#include "inc/x_input.h" +#include "inc/w_input.h" +#include "inc/y_input.h" +#include "inc/z_output.h" +#include "inc/golden.h" +#include "inc/tensor_dim.h" + +/* + * + * For control, generic configuration register layout, + * and job-dependent register map, look at redmule_archi.h + * + */ + +// For all the following functions we use __builtin_pulp_OffsetedWrite and __builtin_pulp_OffsetedRead +// instead of classic load/store because otherwise the compiler is not able to correctly factorize +// the HWPE base in case several accesses are done, ending up with twice more code + +#define HWPE_WRITE(value, offset) *(int *)(ARCHI_CLUST_HWPE_BASE + offset) = value +#define HWPE_READ(offset) *(int *)(ARCHI_CLUST_HWPE_BASE + offset) + +static inline void redmule_x_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_X_PTR); +} + +static inline void redmule_w_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_W_PTR); +} + +static inline void redmule_y_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Y_PTR); +} + +static inline void redmule_z_add_set (unsigned int value) { + 
HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Z_PTR); +} + +// static inline void redmule_mcfg_set (uint32_t mcfg0, uint32_t mcfg1) { +// HWPE_WRITE(mcfg0, REDMULE_REG_OFFS + REDMULE_MCFG0_PTR); +// HWPE_WRITE(mcfg1, REDMULE_REG_OFFS + REDMULE_MCFG1_PTR); +// } +// +// static inline void redmule_arith_set (uint32_t arith) { +// HWPE_WRITE(arith, REDMULE_REG_OFFS + REDMULE_ARITH_PTR); +// } + +static inline void hwpe_trigger_job() { + HWPE_WRITE(0, REDMULE_TRIGGER); +} + +static inline int hwpe_acquire_job() { + return HWPE_READ(REDMULE_ACQUIRE); +} + +static inline unsigned int hwpe_get_status() { + return HWPE_READ(REDMULE_STATUS); +} + +static inline unsigned int hwpe_get_running_job() { + return HWPE_READ(REDMULE_RUNNING_JOB); +} + +static inline void hwpe_soft_clear() { + HWPE_WRITE(0, REDMULE_SOFT_CLEAR); +} + +static inline void hwpe_cg_enable() { + *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) |= CLUST_CTRL_HWPE_EN_MASK; +} + +static inline void hwpe_cg_disable() { + *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) &= ~CLUST_CTRL_HWPE_EN_MASK; +} + +static inline void redmule_evt_wait() { + do { + eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0); + } while((*(int volatile *)(ARCHI_CLUST_HWPE_BASE + REDMULE_STATUS)) != 0); +} + +static inline int hwpe_wait_acquire() { + int job_id = hwpe_acquire_job(); + while(job_id < 0) { + eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0); + job_id = hwpe_acquire_job(); + } + return job_id; +} + +static inline unsigned int redmule_get_data_correctable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + DATA_CORR_ERR); +} + +static inline unsigned int redmule_get_data_uncorrectable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + DATA_UNCORR_ERR); +} + +static inline unsigned int redmule_get_meta_correctable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + METADATA_CORR_ERR); +} + +static inline unsigned int redmule_get_meta_uncorrectable_count () { + return 
HWPE_READ(REDMULE_ECC_REG_OFFS + METADATA_UNCORR_ERR); +} + +/* DMA APIs */ +static inline int mchan_alloc(){ + return *(volatile int*) DMA_COMMAND_QUEUE; +} + +static inline void mchan_transfer(unsigned int len, + unsigned int ext_addr, + unsigned int tcdm_addr) { + + *(volatile int*) DMA_COMMAND_QUEUE = len | + (DMA_RX << PLP_DMA_TYPE_BIT) | + (DMA_INC << PLP_DMA_INCR_BIT) | + (0 << PLP_DMA_2D_BIT) | + (1 << PLP_DMA_ELE_BIT) | + (1 << PLP_DMA_ILE_BIT) | + (0 << PLP_DMA_BLE_BIT) | + (0 << PLP_DMA_2D_TCDM_BIT); + *(volatile int*) DMA_COMMAND_QUEUE = tcdm_addr; + *(volatile int*) DMA_COMMAND_QUEUE = ext_addr; +} + +static inline void mchan_barrier(int id) { + while(((*(volatile int*)(DMA_STATUS_REGISTER)) >> id ) & 0x1 ) { + eu_evt_maskWaitAndClr(1 << FC_DMA_EVENT); + } +} + +static inline void mchan_free(int id) { + *(volatile int*) DMA_STATUS_REGISTER = 0x1 << id; +} + +// void redmule_cfg (unsigned int x, unsigned int w, unsigned int z, +// uint16_t m_size, uint16_t n_size, uint16_t k_size, +// uint8_t gemm_op, uint8_t gemm_fmt){ +// +// uint32_t mcfg_reg0 = 0; +// uint32_t mcfg_reg1 = 0; +// uint32_t arith_reg = 0; +// +// mcfg_reg0 = (k_size << 16) | +// (m_size << 0); +// mcfg_reg1 = n_size << 0; +// +// arith_reg = (gemm_op << 10) | +// (gemm_fmt << 7); +// +// redmule_x_add_set ((unsigned int) x); +// redmule_w_add_set ((unsigned int) w); +// redmule_z_add_set ((unsigned int) z); +// redmule_mcfg_set ((unsigned int) mcfg_reg0, +// (unsigned int) mcfg_reg1); +// redmule_arith_set ((unsigned int) arith_reg); +// +// } + +void redmule_cfg (uint16_t m_size, uint16_t n_size, uint16_t k_size, uint8_t gemm_ops){ + uint32_t x_iters = 0; + uint32_t w_iters = 0; + uint32_t leftovers = 0; + uint32_t left_params = 0; + uint32_t x_d1_stride = 0; + uint32_t x_rows_offs = 0; + uint32_t w_tot_len = 0; + uint32_t w_d1_len = 0; + uint32_t w_d0_stride = 0; + uint32_t yz_tot_len = 0; + uint32_t yz_d0_stride = 0; + uint32_t yz_d2_stride = 0; + uint32_t tot_x_read = 0; + uint32_t 
x_buffer_slots = 0; + uint32_t op_selection = 0; + uint16_t tot_stores = 0; + uint16_t w_rows = n_size; + uint16_t depth = DATA_WIDTH/(ARRAY_HEIGHT*FPFORMAT); + uint8_t tile = ARRAY_HEIGHT*(PIPE_REGS + 1); + _Bool x_rows_sub = 0; + _Bool x_cols_sub = 0; + _Bool w_cols_sub = 0; + uint16_t x_rows_iter, + x_rows_iter_tmp, + w_rows_iter, + w_rows_iter_tmp; + uint16_t x_cols_iter, + x_cols_iter_tmp, + w_cols_iter, + w_cols_iter_tmp; + uint8_t x_rows_lftovr, + x_cols_lftovr, + w_rows_lftovr, + w_cols_lftovr, + slots; + + // Calculating the number of iterations alng the two dimensions of the X matrix + x_rows_iter_tmp = m_size/ARRAY_WIDTH; + x_cols_iter_tmp = n_size/tile; + + // Calculating the number of iterations alng the two dimensions of the W matrix + w_rows_iter_tmp = w_rows; + w_cols_iter_tmp = k_size/tile; + + // Calculating the residuals along the input dimensions + x_rows_lftovr = m_size - (x_rows_iter_tmp*ARRAY_WIDTH); + x_cols_lftovr = n_size - (x_cols_iter_tmp*tile); + + // Calculating the residuals along the weight dimensions + w_rows_lftovr = n_size - (ARRAY_HEIGHT*(w_rows/ARRAY_HEIGHT)); + w_cols_lftovr = k_size - (w_cols_iter_tmp*tile); + + if (w_cols_lftovr != 0) + w_cols_iter = w_cols_iter_tmp + 1; + else + w_cols_iter = w_cols_iter_tmp; + + if (w_rows_lftovr != 0) + w_rows_iter = w_rows_iter_tmp + ARRAY_HEIGHT - w_rows_lftovr; + else + w_rows_iter = w_rows_iter_tmp; + + if (x_cols_lftovr != 0) + x_cols_iter = x_cols_iter_tmp + 1; + else + x_cols_iter = x_cols_iter_tmp; + + if (x_rows_lftovr != 0) + x_rows_iter = x_rows_iter_tmp + 1; + else + x_rows_iter = x_rows_iter_tmp; + + if (x_cols_lftovr%depth != 0) + x_buffer_slots = x_cols_lftovr/depth + 1; + else + x_buffer_slots = x_cols_lftovr/depth; + + // Calculating the number of total stores + tot_stores = x_rows_iter*w_cols_iter; + + // Determining if input matrixes are sub-matrixes + if (m_size < ARRAY_WIDTH) + x_rows_sub = 1; + if (n_size < ARRAY_HEIGHT) + x_cols_sub = 1; + if (k_size < tile) + 
w_cols_sub = 1; + + // Operation selection + switch (gemm_ops) { + case MATMUL: + op_selection |= (RNE << 29 | RNE << 26 | OP_FMADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 0; + break; + + case GEMM: + op_selection |= (RNE << 29 | RNE << 26 | OP_FMADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case ADDMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_ADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case ADDMIN: + op_selection |= (RNE << 29 | RNE << 26 | OP_ADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MULMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_MUL << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MULMIN: + op_selection |= (RNE << 29 | RNE << 26 | OP_MUL << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MAXMIN: + op_selection |= (RTZ << 29 | RNE << 26 | OP_MINMAX << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MINMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_MINMAX << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + } + + // Storing iterations and residuals in registers + x_iters |= x_rows_iter << 16 | x_cols_iter << 0; + w_iters |= w_rows_iter << 16 | w_cols_iter << 0; + leftovers |= x_rows_lftovr << 24 | x_cols_lftovr << 16 | w_rows_lftovr << 8 | w_cols_lftovr << 0; + left_params |= tot_stores << 16 | x_rows_sub << 15 | x_cols_sub << 14 | w_cols_sub << 13; + x_d1_stride = ((4*FPFORMAT)/ADDR_WIDTH)*(((DATA_WIDTH/FPFORMAT)*x_cols_iter_tmp) + x_cols_lftovr); + x_rows_offs = ARRAY_WIDTH*x_d1_stride; + w_tot_len = w_rows_iter*w_cols_iter*x_rows_iter; + w_d0_stride = ((4*FPFORMAT)/ADDR_WIDTH)*(((DATA_WIDTH/FPFORMAT)*w_cols_iter_tmp) + w_cols_lftovr); + yz_tot_len = ARRAY_WIDTH*x_rows_iter*w_cols_iter; + yz_d0_stride = w_d0_stride; + yz_d2_stride = ARRAY_WIDTH*w_d0_stride; + tot_x_read = x_rows_iter*x_cols_iter*w_cols_iter; + + 
// Writing the computations in configuration register + HWPE_WRITE(x_iters , REDMULE_REG_OFFS + REDMULE_REG_X_ITER_PTR ); + HWPE_WRITE(w_iters , REDMULE_REG_OFFS + REDMULE_REG_W_ITER_PTR ); + HWPE_WRITE(leftovers , REDMULE_REG_OFFS + REDMULE_REG_LEFTOVERS_PTR ); + HWPE_WRITE(left_params , REDMULE_REG_OFFS + REDMULE_REG_LEFT_PARAMS_PTR ); + HWPE_WRITE(x_d1_stride , REDMULE_REG_OFFS + REDMULE_REG_X_D1_STRIDE_PTR ); + HWPE_WRITE(x_rows_offs , REDMULE_REG_OFFS + REDMULE_REG_X_ROWS_OFFS_PTR ); + HWPE_WRITE(tot_x_read , REDMULE_REG_OFFS + REDMULE_REG_TOT_X_READ_PTR ); + HWPE_WRITE(x_buffer_slots, REDMULE_REG_OFFS + REDMULE_REG_X_BUFFER_SLOTS_PTR ); + HWPE_WRITE(w_tot_len , REDMULE_REG_OFFS + REDMULE_REG_W_TOT_LEN_PTR ); + HWPE_WRITE(w_d0_stride , REDMULE_REG_OFFS + REDMULE_REG_W_D0_STRIDE_PTR ); + HWPE_WRITE(yz_tot_len , REDMULE_REG_OFFS + REDMULE_REG_YZ_TOT_LEN_PTR ); + HWPE_WRITE(yz_d0_stride , REDMULE_REG_OFFS + REDMULE_REG_YZ_D0_STRIDE_PTR ); + HWPE_WRITE(yz_d2_stride , REDMULE_REG_OFFS + REDMULE_REG_YZ_D2_STRIDE_PTR ); + HWPE_WRITE(op_selection , REDMULE_REG_OFFS + REDMULE_REG_OP_SELECTION ); +} + +void generate_test_data16(int x_start_addr, + int w_start_addr, + int y_start_addr, + int m_size, + int n_size, + int k_size) { + + int x_addr = x_start_addr; + int w_addr = w_start_addr; + int y_addr = y_start_addr; + int x_end_addr = x_start_addr + (2*m_size*n_size); + int w_end_addr = w_start_addr + (2*n_size*k_size); + int y_end_addr = y_start_addr + (2*m_size*k_size); + + // Generating input stimuli from golden model + for (x_addr = x_start_addr; x_addr < x_end_addr; x_addr += 2) { + int x = x_addr - x_start_addr; + *(uint32_t *)(x_addr) = x_inp[x/2]; + } + + // Generating Weight stimuli from golden model + for (w_addr = w_start_addr; w_addr < w_end_addr; w_addr += 2) { + int w = w_addr - w_start_addr; + *(uint32_t *)(w_addr) = w_inp[w/2]; + } + + for (y_addr = y_start_addr; y_addr < y_end_addr; y_addr += 2) { + int y = y_addr - y_start_addr; + *(uint32_t *)(y_addr) 
= y_inp[y/2]; + } +} + +int redmule_compare16 (int z_start_addr, int m_size, int k_size) { + int err = 0; + int z_end_addr = z_start_addr + 2*m_size*k_size; + uint16_t z_computed; + uint16_t diff, diff_1, diff_2; + + for (int z_addr = z_start_addr; z_addr < z_end_addr; z_addr += 2) { + int z = z_addr - z_start_addr; + z_computed = *(uint32_t *)(z_addr); + + if ( z_computed != z_oup[z/2] ) { + diff_1 = z_computed - z_oup[z/2]; + if (diff_1 > 3) { + diff_2 = z_oup[z/2] - z_computed; + if (diff_2 > 3) { + err++; + } + } + } + } + + return err; + +} + +int redmule16_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { + #define ERR 0x0011 + uint32_t actual_word = 0; + uint16_t actual_MSHWord, actual_LSHWord; + uint32_t golden_word = 0; + uint16_t golden_MSHWord, golden_LSHWord; + uint32_t actual = 0; + uint32_t golden = 0; + + int errors = 0; + int error; + + for (int i=0; i golden_LSHWord) ? (actual_LSHWord - golden_LSHWord) : 0; + diff = (actual_LSHWord < golden_LSHWord) ? (golden_LSHWord - actual_LSHWord) : 0; + + if (diff > ERR) { + error = 1; + #ifdef VERBOSE + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("LSW: Error!\n"); + #endif + } + + // Checking Most Significant Half-Word + actual_MSHWord = (uint16_t)((actual_word >> 16) & 0x0000FFFF); + golden_MSHWord = (uint16_t)((golden_word >> 16) & 0x0000FFFF); + + diff = (actual_MSHWord > golden_MSHWord) ? (actual_MSHWord - golden_MSHWord) : 0; + diff = (actual_MSHWord < golden_MSHWord) ? 
(golden_MSHWord - actual_MSHWord) : 0; + + if (diff > ERR) { + error = 1; + #ifdef VERBOSE + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("MSW: Error!\n"); + #endif + } + + errors += error; + + #ifdef DEBUG + tfp_printf(" Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); + #endif + + #ifdef VERBOSE + if(error) { + if(errors==1) tfp_printf(" golden <- actual @ address @ index\n"); + tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z+i), i*4); + } + #endif + } + return errors; +} + +int redmule8_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { + #define ERR 0x0011 + uint32_t actual_word = 0; + uint8_t actual_Byte0, + actual_Byte1, + actual_Byte2, + actual_Byte3; + uint32_t golden_word = 0; + uint8_t golden_Byte0, + golden_Byte1, + golden_Byte2, + golden_Byte3; + uint32_t actual = 0; + uint32_t golden = 0; + + int errors = 0; + int error; + + for (int i=0; i golden_Byte0) ? (actual_Byte0 - golden_Byte0) : 0; + diff = (actual_Byte0 < golden_Byte0) ? (golden_Byte0 - actual_Byte0) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte0: Error!\n"); + } + + // Cheching Byte1 + actual_Byte1 = (uint8_t)( (actual_word >> 8 ) & 0x000000FF); + golden_Byte1 = (uint8_t)( (golden_word >> 8 ) & 0x000000FF); + + diff = (actual_Byte1 > golden_Byte1) ? (actual_Byte1 - golden_Byte1) : 0; + diff = (actual_Byte1 < golden_Byte1) ? (golden_Byte1 - actual_Byte1) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte1: Error!\n"); + } + + // Cheching Byte2 + actual_Byte2 = (uint8_t)( (actual_word >> 16 ) & 0x000000FF); + golden_Byte2 = (uint8_t)( (golden_word >> 16 ) & 0x000000FF); + + diff = (actual_Byte2 > golden_Byte2) ? (actual_Byte2 - golden_Byte2) : 0; + diff = (actual_Byte2 < golden_Byte2) ? 
(golden_Byte2 - actual_Byte2) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte2: Error!\n"); + } + + // Cheching Byte3 + actual_Byte3 = (uint8_t)( (actual_word >> 24 ) & 0x000000FF); + golden_Byte3 = (uint8_t)( (golden_word >> 24 ) & 0x000000FF); + + diff = (actual_Byte3 > golden_Byte3) ? (actual_Byte3 - golden_Byte3) : 0; + diff = (actual_Byte3 < golden_Byte3) ? (golden_Byte3 - actual_Byte3) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte3: Error!\n"); + } + + errors += error; + + #ifdef DEBUG + tfp_printf(" Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); + #endif + + #ifdef VERBOSE + if(error) { + if(errors==1) tfp_printf(" golden <- actual @ address @ index\n"); + tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z+i), i*4); + } + #endif + } + return errors; +} + +#endif diff --git a/hwpe/redmule/inc/golden.h b/hwpe/redmule/inc/golden.h new file mode 100644 index 0000000..f664e47 --- /dev/null +++ b/hwpe/redmule/inc/golden.h @@ -0,0 +1,387 @@ + /* Header file generated by RedMulE Golden Model */ +uint32_t golden [384] = { +0x48974845, +0x48384608, +0x487b4855, +0x48804869, +0x48b046d1, +0x483f48db, +0x485f48c9, +0x483a4881, +0x472c484b, +0x492b4762, +0x48fd4822, +0x492e488e, +0x484f483e, +0x46d749e8, +0x489d484b, +0x47e9490b, +0x47d2484f, +0x474744be, +0x46c047c7, +0x48af4727, +0x482d46c5, +0x482e483d, +0x479f4897, +0x4749488b, +0x46a8489a, +0x488b46f2, +0x47e84891, +0x483d4872, +0x46fd4716, +0x46a049b5, +0x47a446e7, +0x476748a1, +0x49354939, +0x48c14703, +0x48bd4863, +0x48cf4913, +0x48b848b6, +0x49204946, +0x48e1495e, +0x48b24938, +0x4882493a, +0x49d5483b, +0x49724911, +0x49df496b, +0x488848f2, +0x48214a46, +0x490c48c1, +0x48a349b2, +0x47b0463a, +0x476244cb, +0x46b94765, +0x4814466a, +0x47964631, +0x474b4666, +0x47044798, +0x47614838, +0x459047d3, +0x48a245ea, +0x484447f1, +0x4776484b, +0x46d847d6, +0x44d348f3, 
+0x478d46fa, +0x466e481e, +0x481e4827, +0x479445a2, +0x48064727, +0x48d5475d, +0x48284708, +0x480d4862, +0x48324895, +0x47f148bd, +0x46a7482a, +0x492d47b1, +0x4884484d, +0x485f48dc, +0x480c476d, +0x46d348e9, +0x48844728, +0x480e48a0, +0x48134862, +0x485a4675, +0x473847e8, +0x48234836, +0x482146e7, +0x47b34822, +0x48554846, +0x47174863, +0x47c14872, +0x488e46d5, +0x485f47e2, +0x48b8487c, +0x4788481e, +0x467748bd, +0x47f846c9, +0x47fc48fe, +0x47b247a0, +0x467e4588, +0x46c74662, +0x481246e8, +0x474e4536, +0x468f46c0, +0x4679481f, +0x46e246a1, +0x45604809, +0x47eb4630, +0x475746b5, +0x477f4848, +0x46d846a6, +0x459a4870, +0x46784670, +0x468c47d2, +0x48c44762, +0x479146e3, +0x486d46b1, +0x486747d0, +0x47f6468d, +0x475648a5, +0x48544857, +0x48384866, +0x46ec484d, +0x48f647d2, +0x4879484a, +0x483c4848, +0x4806471d, +0x473048fa, +0x47b84768, +0x46f94865, +0x491848a8, +0x486746ca, +0x48624800, +0x491048d3, +0x4849474e, +0x486b48eb, +0x48c54966, +0x483048f4, +0x477848f9, +0x499e481e, +0x48f148cf, +0x49234982, +0x47cf487c, +0x464949ea, +0x495e4773, +0x483f48b2, +0x497548a7, +0x481e4616, +0x4866481f, +0x486448b6, +0x487347dc, +0x487f485c, +0x491f4938, +0x48b6490d, +0x48a148f8, +0x492d4859, +0x4915489c, +0x48874899, +0x4859486c, +0x471e49ca, +0x49184867, +0x482748d3, +0x4998488b, +0x481d4704, +0x488048b8, +0x49444876, +0x48f2470c, +0x489b48b9, +0x48e54956, +0x48a548d6, +0x485648dc, +0x49ab484e, +0x490e48e0, +0x494548dd, +0x48dd488b, +0x47ea4a32, +0x49114835, +0x48194965, +0x481e460e, +0x4673452c, +0x4717475c, +0x46d046f6, +0x46bc4696, +0x481e4726, +0x46ea4763, +0x475846fe, +0x4627478b, +0x483f4704, +0x47b146ad, +0x48164792, +0x468446f2, +0x45a84827, +0x47a4472f, +0x462b4797, +0x48ab483f, +0x4863468f, +0x4766485a, +0x48cb481d, +0x490347dc, +0x483048fc, +0x483e48cc, +0x486448ab, +0x47634966, +0x499d4794, +0x488b488e, +0x496048dc, +0x484c4854, +0x474c499c, +0x48bc4826, +0x48834949, +0x4905489d, +0x481e4718, +0x48f448e3, +0x490448c1, +0x48b347e8, +0x48d44892, +0x489448ff, 
+0x488648d5, +0x480348fa, +0x492e47d2, +0x48b24870, +0x492b48e5, +0x4785487b, +0x471d49e3, +0x48bf4837, +0x48c4489b, +0x4871475c, +0x4811464a, +0x471c47af, +0x48174817, +0x484e463b, +0x464f477f, +0x487c4704, +0x472547a3, +0x462a4853, +0x4860465a, +0x48804736, +0x482b47e1, +0x46c04811, +0x475d48dc, +0x48064668, +0x46f44893, +0x49594858, +0x487b463d, +0x484e480f, +0x48a648c0, +0x48944847, +0x484a48a0, +0x48f4491e, +0x48b548fc, +0x47d248ce, +0x497f47db, +0x49394955, +0x48ce48a7, +0x48844890, +0x476349d6, +0x4922486e, +0x48c348f4, +0x491c47ec, +0x47834698, +0x47544715, +0x47524745, +0x4832472f, +0x48094817, +0x48c347f8, +0x480047e6, +0x473048b6, +0x48cb480a, +0x488e479e, +0x488e47c2, +0x47ee472f, +0x4744489d, +0x48514755, +0x47d34846, +0x48a04838, +0x47624634, +0x48064786, +0x482d47e3, +0x486c4726, +0x480347b7, +0x481448ac, +0x483948e0, +0x47504827, +0x48c546f2, +0x4886483f, +0x485648ad, +0x47a947e8, +0x47434937, +0x481f46d0, +0x4804484c, +0x481f47fd, +0x4813456d, +0x4807474d, +0x480e4688, +0x481046e8, +0x4799469f, +0x478f4853, +0x482447f2, +0x471f47d0, +0x485f46da, +0x481c4813, +0x4863482e, +0x480b4786, +0x46b848c9, +0x46e2475a, +0x46c54852, +0x480245af, +0x46c24466, +0x4743465d, +0x47ba46b7, +0x46c34636, +0x47844677, +0x47c2485a, +0x46ac46dc, +0x460e47de, +0x4834465f, +0x476947f4, +0x481046fc, +0x45ea45fd, +0x45b548d0, +0x47834704, +0x46c44830, +0x47c74759, +0x45b0453d, +0x47024741, +0x47934736, +0x47ba461b, +0x46dd470b, +0x470b4657, +0x4710470d, +0x468f486c, +0x46ba45c3, +0x483b479d, +0x477446c9, +0x46a746a9, +0x46064833, +0x46a94690, +0x46a746f5, +0x48bb47ac, +0x4803452c, +0x4824470f, +0x48cb47d5, +0x484a4707, +0x47974832, +0x482c4851, +0x4877487a, +0x465d4891, +0x48ce47f4, +0x48994898, +0x486a484e, +0x47f047ac, +0x4611493e, +0x489e47e2, +0x46af488c, +0x48364665, +0x46b645e4, +0x46b946a1, +0x46dd46c8, +0x474b4658, +0x4777467b, +0x47984769, +0x475e4785, +0x4656472a, +0x488145fb, +0x472d46fc, +0x47a3476e, +0x46ca465d, +0x45004855, +0x479a464f, +0x473846c3, 
+0x486c481e, +0x48014659, +0x477a4756, +0x487b47d5, +0x48084706, +0x4838484f, +0x48634870, +0x480648d3, +0x47714865, +0x494c46be, +0x484c4915, +0x48624900, +0x46e8481a, +0x46a04974, +0x483d4775, +0x480e487c, +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/tensor_dim.h b/hwpe/redmule/inc/tensor_dim.h new file mode 100644 index 0000000..21bd0d8 --- /dev/null +++ b/hwpe/redmule/inc/tensor_dim.h @@ -0,0 +1,13 @@ + /* Header file generated by RedMulE Golden Model */ +#ifndef __TENSOR_DIM__ +#define __TENSOR_DIM__ + +#define M_SIZE 24 +#define N_SIZE 32 +#define K_SIZE 32 +#define SRC_FMT FP16 +#define DST_FMT FP16 +#define FPFORMAT 16 +uint8_t gemm_ops = GEMM; + +#endif diff --git a/hwpe/redmule/inc/w_2D.h b/hwpe/redmule/inc/w_2D.h new file mode 100644 index 0000000..9409c64 --- /dev/null +++ b/hwpe/redmule/inc/w_2D.h @@ -0,0 +1,35 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t w_inp_2D [32][32] = { +0x311a, 0x39e0, 0x387d, 0x3a4a, 0x386f, 0x3ada, 0x392f, 0x3854, 0x3014, 0x2fd2, 0x31c9, 0x2fca, 0x2e55, 0x3bc8, 0x396d, 0x3b1d, 0x39f6, 0x333a, 0x3908, 0x3628, 0x3bab, 0x3b8b, 0x3b4a, 0x322d, 0x3925, 0x317a, 0x3725, 0x31c2, 0x3066, 0x38f3, 0x3a17, 0x3476, +0x3bda, 0x3196, 0x3922, 0x3680, 0x396a, 0x3021, 0x3761, 0x374d, 0x2fc2, 0x3967, 0x3b94, 0x33b5, 0x3797, 0x34d6, 0x3655, 0x2176, 0x39bc, 0x3999, 0x3658, 0x3904, 0x3759, 0x2ade, 0x3a5a, 0x3b78, 0x36c7, 0x2d01, 0x3b58, 0x2d9a, 0x373d, 0x3952, 0x38e8, 0x3887, +0x37b6, 0x3a88, 0x2f8a, 0x2d79, 0x3413, 0x3421, 0x3976, 0x32b2, 0x3446, 0x2d99, 0x3a56, 0x3322, 0x3b49, 0x39fa, 0x3acd, 0x3af6, 0x304c, 0x3abb, 0x3a83, 0x38b2, 0x3ab9, 0x363e, 0x389f, 0x31bb, 0x38e1, 0x3bc4, 0x3b9b, 0x2984, 0x3a43, 0x3b2f, 0x35d6, 0x3bda, +0x2df3, 0x3bf8, 0x2acc, 0x378b, 0x3555, 0x2e59, 0x31d4, 0x34ec, 0x3a46, 0x3bab, 0x3214, 0x3161, 0x3470, 0x3a03, 0x368e, 0x31ad, 0x27cb, 0x2ecb, 0x3422, 0x39f7, 0x3644, 0x3a77, 0x313f, 0x34f2, 0x39b3, 0x3bf2, 0x379a, 0x3456, 0x35fe, 0x3ae7, 0x3964, 0x385f, +0x3b16, 0x3999, 0x3833, 
0x2eda, 0x3afd, 0x3a4a, 0x3ba2, 0x2bd4, 0x3b38, 0x31a2, 0x32dd, 0x353c, 0x366f, 0x375e, 0x3821, 0x367a, 0x3b44, 0x39e6, 0x3787, 0x339e, 0x39d7, 0x38c6, 0x37d5, 0x342f, 0x3984, 0x319b, 0x33b5, 0x35ab, 0x398a, 0x374e, 0x36b6, 0x3b21, +0x3bbb, 0x2ab3, 0x2ad5, 0x33bc, 0x2bef, 0x3780, 0x3738, 0x3a0b, 0x3b09, 0x30ca, 0x384e, 0x3ab3, 0x39bd, 0x3453, 0x3a6d, 0x3957, 0x2c10, 0x30e9, 0x35d4, 0x3aef, 0x3be9, 0x39ad, 0x3a74, 0x3af9, 0x3739, 0x2d4d, 0x39fe, 0x3b72, 0x2c57, 0x398c, 0x381f, 0x3930, +0x3820, 0x321b, 0x3964, 0x2964, 0x33a0, 0x2d00, 0x2490, 0x336b, 0x3465, 0x3b2e, 0x3aa0, 0x371f, 0x300e, 0x3a09, 0x3bf1, 0x25cc, 0x3b6f, 0x3384, 0x3a88, 0x3acb, 0x3814, 0x36d0, 0x3081, 0x3a2c, 0x3353, 0x39cb, 0x31ed, 0x3af6, 0x3721, 0x36c7, 0x2ce2, 0x390d, +0x3698, 0x3ab2, 0x3b3e, 0x2eb4, 0x3998, 0x39e3, 0x3a77, 0x3632, 0x2c12, 0x3bd5, 0x3ba3, 0x3bba, 0x323c, 0x367b, 0x3557, 0x39c8, 0x37db, 0x3b45, 0x3b6e, 0x3931, 0x3121, 0x3a8d, 0x3a55, 0x3b9b, 0x358a, 0x3925, 0x3491, 0x3912, 0x3b6b, 0x3584, 0x32df, 0x3120, +0x32b2, 0x3b0a, 0x2cad, 0x3465, 0x3ad3, 0x3bcd, 0x363b, 0x3afe, 0x354b, 0x3374, 0x39af, 0x3b7f, 0x308c, 0x2e72, 0x3380, 0x3b70, 0x3902, 0x38d8, 0x39f3, 0x3a4b, 0x3853, 0x397b, 0x2ebe, 0x387f, 0x2845, 0x37e2, 0x360f, 0x370b, 0x3acb, 0x35d4, 0x36e6, 0x3262, +0x2e88, 0x3a54, 0x2ee3, 0x3575, 0x3afe, 0x2aee, 0x39a0, 0x3aae, 0x3693, 0x3432, 0x3834, 0x3b9b, 0x3bcb, 0x2e3a, 0x356d, 0x374e, 0x3924, 0x383c, 0x311e, 0x3ac5, 0x352d, 0x311e, 0x38ca, 0x34d4, 0x36ca, 0x34ed, 0x3a13, 0x33eb, 0x3639, 0x3828, 0x3b3c, 0x3939, +0x3837, 0x3521, 0x2cb5, 0x3629, 0x3924, 0x384c, 0x366a, 0x3bbf, 0x2e9e, 0x3ba8, 0x33ad, 0x38c8, 0x3934, 0x3907, 0x249a, 0x3690, 0x3a09, 0x3215, 0x3898, 0x325d, 0x37d5, 0x3195, 0x361c, 0x3ae4, 0x351f, 0x3452, 0x3bc0, 0x375c, 0x39bf, 0x317a, 0x3aae, 0x283a, +0x3476, 0x3b92, 0x3472, 0x383e, 0x280f, 0x39d6, 0x2fd1, 0x31f4, 0x2ffb, 0x3b97, 0x3692, 0x36c0, 0x3989, 0x33cf, 0x3ba6, 0x3239, 0x35d7, 0x33ab, 0x31eb, 0x3b47, 0x389b, 0x3b88, 0x3580, 0x354c, 0x3802, 0x3b9a, 0x3b94, 0x2a92, 
0x2db1, 0x38bd, 0x2dfb, 0x3900, +0x344f, 0x3739, 0x27a5, 0x3b2e, 0x342b, 0x34bb, 0x30c8, 0x3ae8, 0x3b26, 0x3982, 0x38c0, 0x3408, 0x38c8, 0x36ef, 0x3bf0, 0x3acf, 0x3a3c, 0x3825, 0x31a5, 0x3ada, 0x3b5b, 0x37db, 0x3a01, 0x3663, 0x3a7d, 0x327b, 0x3a1f, 0x3862, 0x38af, 0x3204, 0x372e, 0x3b19, +0x3708, 0x3622, 0x2e62, 0x39ab, 0x2d4d, 0x31b4, 0x3552, 0x3bbc, 0x36f2, 0x36eb, 0x38ef, 0x3755, 0x3bbe, 0x2c17, 0x3815, 0x2f53, 0x363f, 0x38c1, 0x3246, 0x386b, 0x34de, 0x34e4, 0x3baa, 0x349e, 0x32ce, 0x3a68, 0x373f, 0x2cce, 0x3b36, 0x28ba, 0x3b50, 0x3232, +0x1f34, 0x3928, 0x35cd, 0x3b38, 0x30ce, 0x35a1, 0x3a06, 0x3a32, 0x3a53, 0x3489, 0x3241, 0x372f, 0x390c, 0x3a1b, 0x378a, 0x3713, 0x3769, 0x37a8, 0x3418, 0x3ad4, 0x3a4e, 0x3bf7, 0x37a5, 0x34dc, 0x39b2, 0x351b, 0x3372, 0x349f, 0x2f50, 0x3ab1, 0x3795, 0x2db7, +0x3864, 0x3157, 0x3900, 0x323e, 0x389e, 0x3880, 0x3b1f, 0x37a1, 0x396c, 0x2e43, 0x2c2a, 0x3b78, 0x3988, 0x3a14, 0x39c1, 0x3b51, 0x3780, 0x3bf2, 0x2d19, 0x3815, 0x3a5f, 0x3641, 0x2f62, 0x37d5, 0x3564, 0x139a, 0x3ab8, 0x28f7, 0x3785, 0x34e1, 0x3097, 0x3768, +0x3971, 0x3ae2, 0x32ae, 0x2fd5, 0x382a, 0x346c, 0x3133, 0x3167, 0x3940, 0x2d12, 0x389a, 0x3bd0, 0x3943, 0x391c, 0x3a75, 0x2a11, 0x391e, 0x372d, 0x3a79, 0x3b72, 0x3373, 0x39b7, 0x35d7, 0x372b, 0x3a6d, 0x38a1, 0x3279, 0x3434, 0x3694, 0x3b45, 0x3abb, 0x392d, +0x34a8, 0x3757, 0x32ca, 0x345d, 0x36a5, 0x3854, 0x2dcd, 0x30af, 0x38dd, 0x3067, 0x3411, 0x3997, 0x397a, 0x3a64, 0x38b8, 0x3962, 0x3509, 0x3bb6, 0x3a66, 0x339f, 0x372a, 0x31a8, 0x37da, 0x36ff, 0x33c6, 0x31da, 0x3977, 0x3b72, 0x3841, 0x3567, 0x3433, 0x33b8, +0x39fe, 0x3a10, 0x3bf2, 0x35e7, 0x3a4a, 0x3b3e, 0x2ec7, 0x3aa4, 0x3846, 0x3af9, 0x38a9, 0x2c1f, 0x39ab, 0x349f, 0x31d6, 0x39ae, 0x3b79, 0x352d, 0x3516, 0x347c, 0x2f33, 0x35ad, 0x31c4, 0x3b52, 0x354b, 0x3786, 0x3ab7, 0x3896, 0x34ac, 0x352f, 0x37e6, 0x326a, +0x2e44, 0x34c7, 0x388d, 0x3bf4, 0x363f, 0x3b3d, 0x33b1, 0x3b8b, 0x3340, 0x37f7, 0x3b07, 0x25bf, 0x398e, 0x3505, 0x3bd7, 0x366d, 0x388a, 0x2cc0, 0x359a, 0x3b9a, 0x3b99, 
0x379d, 0x3b6b, 0x39b8, 0x3223, 0x2703, 0x3ba9, 0x2ecb, 0x3759, 0x39d8, 0x37ac, 0x32cf, +0x35f2, 0x38a3, 0x399e, 0x3bd2, 0x3780, 0x3af3, 0x3b5e, 0x337b, 0x3a08, 0x35da, 0x3446, 0x3b25, 0x3ad0, 0x3bee, 0x3141, 0x32d8, 0x34ce, 0x2ac9, 0x3800, 0x3a8a, 0x2d53, 0x368a, 0x3561, 0x3998, 0x35a3, 0x3677, 0x3ab2, 0x3269, 0x3236, 0x3b3e, 0x3aba, 0x3bac, +0x395d, 0x3820, 0x1df6, 0x3bb5, 0x35b5, 0x3675, 0x3b74, 0x360f, 0x34de, 0x3a0c, 0x3aeb, 0x299d, 0x3207, 0x3bd8, 0x2178, 0x3995, 0x3948, 0x3908, 0x3843, 0x2ea5, 0x3045, 0x3989, 0x345d, 0x39c5, 0x3a89, 0x3863, 0x3be0, 0x397a, 0x38f1, 0x39e2, 0x3b08, 0x352e, +0x385f, 0x28f2, 0x3bc3, 0x35e0, 0x380c, 0x3b9c, 0x3afc, 0x390a, 0x3689, 0x34fd, 0x2cf5, 0x308e, 0x342b, 0x3921, 0x3a67, 0x3ad6, 0x2986, 0x32fc, 0x35aa, 0x3507, 0x3608, 0x33fd, 0x3bf3, 0x39e2, 0x3b0f, 0x30b7, 0x3896, 0x3ae4, 0x2145, 0x35b6, 0x2e1d, 0x3ad1, +0x333d, 0x3afb, 0x2703, 0x3413, 0x1d7d, 0x3b7f, 0x3ae1, 0x303c, 0x3004, 0x39d3, 0x3554, 0x31a4, 0x354e, 0x3662, 0x39c5, 0x2eb7, 0x2c6e, 0x397f, 0x31d8, 0x1f0c, 0x38e3, 0x35f0, 0x2714, 0x28d1, 0x375e, 0x3a75, 0x3830, 0x3578, 0x397d, 0x3b18, 0x383c, 0x3498, +0x39ad, 0x3598, 0x23c4, 0x34ea, 0x3a61, 0x2b00, 0x3707, 0x3ae1, 0x37ae, 0x389d, 0x37fa, 0x3673, 0x3278, 0xf3e, 0x3809, 0x33c6, 0x3bf5, 0x3279, 0x3816, 0x360c, 0x39c8, 0x381f, 0x3741, 0x2d66, 0x38c0, 0x37d3, 0x377a, 0x3621, 0x2faf, 0x392e, 0x2de6, 0x33c5, +0x3803, 0x2600, 0x32e9, 0x39b4, 0x38d2, 0x34e8, 0x2fe6, 0x3199, 0x3643, 0x3a77, 0x27cc, 0x39d7, 0x34c6, 0x2ea8, 0x364e, 0x3b07, 0x31c7, 0x30a1, 0x31b1, 0x3b8f, 0x3571, 0x3b75, 0x3989, 0x3805, 0x39fb, 0x3945, 0x352b, 0x31d8, 0x3904, 0x3440, 0x3a57, 0x2cf7, +0x3b39, 0x2fcd, 0x2b89, 0x2edd, 0x3682, 0x36a9, 0x32c8, 0x37ac, 0x32a5, 0x3311, 0x394b, 0x3b84, 0x3aec, 0x3601, 0x2765, 0x3b69, 0x396b, 0x3727, 0x3bfe, 0x3907, 0x376f, 0x3674, 0x3973, 0x3671, 0x3491, 0x3993, 0x383f, 0x3335, 0x3989, 0x3550, 0x3077, 0x35f5, +0x3a59, 0x3950, 0x380c, 0x37cd, 0x30bf, 0x3607, 0x3afa, 0x3b5d, 0x32b9, 0x386b, 0x35bd, 0x3aca, 0x3ba5, 0x3b2d, 
0x3b19, 0x3b8b, 0x345e, 0x2845, 0x34aa, 0x372a, 0x3448, 0x34f5, 0x3ae2, 0x3637, 0x2cb5, 0x354b, 0x3b15, 0x2ca8, 0x2641, 0x3178, 0x2cfe, 0x39b4, +0x3bdd, 0x3acb, 0x3a05, 0x38a2, 0x3b4a, 0x34e5, 0x395f, 0x394b, 0x34c4, 0x3aa5, 0x29bb, 0x2d96, 0x339d, 0x387c, 0x382e, 0x385a, 0x396b, 0x3aa9, 0x2f1e, 0x33a7, 0x3b90, 0x3b7b, 0x3b5f, 0x39d3, 0x3b18, 0x354f, 0x2cdb, 0x3a6f, 0x3434, 0x34ff, 0x3a5b, 0x3b84, +0x3a33, 0x384b, 0x2e67, 0x3b85, 0x3853, 0x380c, 0x346a, 0x3aaa, 0x3492, 0x33e8, 0x3bf2, 0x38ae, 0x3a29, 0x3830, 0x3221, 0x35b1, 0x3a48, 0x2c68, 0x2ced, 0x3a7e, 0x3539, 0x3922, 0x374c, 0x3aaa, 0x2dae, 0x395d, 0x3b3d, 0x3890, 0x2cfe, 0x2dd6, 0x3bad, 0x33c5, +0x2c07, 0x3a2c, 0x37a8, 0x390f, 0x2fc8, 0x35ae, 0x388c, 0x30ee, 0x3674, 0x391d, 0x3bfc, 0x36bf, 0x322d, 0x3a78, 0x35c0, 0x3492, 0x3ac8, 0x3504, 0x3315, 0x381d, 0x3a7a, 0x3a08, 0x343c, 0x3bda, 0x341b, 0x39f0, 0x3b9e, 0x395d, 0x3c00, 0x38ab, 0x3bcf, 0x3564, +0x33c4, 0x3b0d, 0x3623, 0x33b9, 0x3b92, 0x1e71, 0x2c57, 0x36d0, 0x314b, 0x3a16, 0x3372, 0x341b, 0x3aaa, 0x3444, 0x396b, 0x2dd7, 0x3b30, 0x3559, 0x3b5b, 0x3a29, 0x2d19, 0x38b7, 0x3b01, 0x3afa, 0x398a, 0x3839, 0x3ac9, 0x2e31, 0x3924, 0x39f2, 0x3a7f, 0x3285 +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/w_input.h b/hwpe/redmule/inc/w_input.h new file mode 100644 index 0000000..dc4d3be --- /dev/null +++ b/hwpe/redmule/inc/w_input.h @@ -0,0 +1,35 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t w_inp [1024] = { +0x311a, 0x39e0, 0x387d, 0x3a4a, 0x386f, 0x3ada, 0x392f, 0x3854, 0x3014, 0x2fd2, 0x31c9, 0x2fca, 0x2e55, 0x3bc8, 0x396d, 0x3b1d, 0x39f6, 0x333a, 0x3908, 0x3628, 0x3bab, 0x3b8b, 0x3b4a, 0x322d, 0x3925, 0x317a, 0x3725, 0x31c2, 0x3066, 0x38f3, 0x3a17, 0x3476, +0x3bda, 0x3196, 0x3922, 0x3680, 0x396a, 0x3021, 0x3761, 0x374d, 0x2fc2, 0x3967, 0x3b94, 0x33b5, 0x3797, 0x34d6, 0x3655, 0x2176, 0x39bc, 0x3999, 0x3658, 0x3904, 0x3759, 0x2ade, 0x3a5a, 0x3b78, 0x36c7, 0x2d01, 0x3b58, 0x2d9a, 0x373d, 0x3952, 0x38e8, 0x3887, +0x37b6, 0x3a88, 0x2f8a, 
0x2d79, 0x3413, 0x3421, 0x3976, 0x32b2, 0x3446, 0x2d99, 0x3a56, 0x3322, 0x3b49, 0x39fa, 0x3acd, 0x3af6, 0x304c, 0x3abb, 0x3a83, 0x38b2, 0x3ab9, 0x363e, 0x389f, 0x31bb, 0x38e1, 0x3bc4, 0x3b9b, 0x2984, 0x3a43, 0x3b2f, 0x35d6, 0x3bda, +0x2df3, 0x3bf8, 0x2acc, 0x378b, 0x3555, 0x2e59, 0x31d4, 0x34ec, 0x3a46, 0x3bab, 0x3214, 0x3161, 0x3470, 0x3a03, 0x368e, 0x31ad, 0x27cb, 0x2ecb, 0x3422, 0x39f7, 0x3644, 0x3a77, 0x313f, 0x34f2, 0x39b3, 0x3bf2, 0x379a, 0x3456, 0x35fe, 0x3ae7, 0x3964, 0x385f, +0x3b16, 0x3999, 0x3833, 0x2eda, 0x3afd, 0x3a4a, 0x3ba2, 0x2bd4, 0x3b38, 0x31a2, 0x32dd, 0x353c, 0x366f, 0x375e, 0x3821, 0x367a, 0x3b44, 0x39e6, 0x3787, 0x339e, 0x39d7, 0x38c6, 0x37d5, 0x342f, 0x3984, 0x319b, 0x33b5, 0x35ab, 0x398a, 0x374e, 0x36b6, 0x3b21, +0x3bbb, 0x2ab3, 0x2ad5, 0x33bc, 0x2bef, 0x3780, 0x3738, 0x3a0b, 0x3b09, 0x30ca, 0x384e, 0x3ab3, 0x39bd, 0x3453, 0x3a6d, 0x3957, 0x2c10, 0x30e9, 0x35d4, 0x3aef, 0x3be9, 0x39ad, 0x3a74, 0x3af9, 0x3739, 0x2d4d, 0x39fe, 0x3b72, 0x2c57, 0x398c, 0x381f, 0x3930, +0x3820, 0x321b, 0x3964, 0x2964, 0x33a0, 0x2d00, 0x2490, 0x336b, 0x3465, 0x3b2e, 0x3aa0, 0x371f, 0x300e, 0x3a09, 0x3bf1, 0x25cc, 0x3b6f, 0x3384, 0x3a88, 0x3acb, 0x3814, 0x36d0, 0x3081, 0x3a2c, 0x3353, 0x39cb, 0x31ed, 0x3af6, 0x3721, 0x36c7, 0x2ce2, 0x390d, +0x3698, 0x3ab2, 0x3b3e, 0x2eb4, 0x3998, 0x39e3, 0x3a77, 0x3632, 0x2c12, 0x3bd5, 0x3ba3, 0x3bba, 0x323c, 0x367b, 0x3557, 0x39c8, 0x37db, 0x3b45, 0x3b6e, 0x3931, 0x3121, 0x3a8d, 0x3a55, 0x3b9b, 0x358a, 0x3925, 0x3491, 0x3912, 0x3b6b, 0x3584, 0x32df, 0x3120, +0x32b2, 0x3b0a, 0x2cad, 0x3465, 0x3ad3, 0x3bcd, 0x363b, 0x3afe, 0x354b, 0x3374, 0x39af, 0x3b7f, 0x308c, 0x2e72, 0x3380, 0x3b70, 0x3902, 0x38d8, 0x39f3, 0x3a4b, 0x3853, 0x397b, 0x2ebe, 0x387f, 0x2845, 0x37e2, 0x360f, 0x370b, 0x3acb, 0x35d4, 0x36e6, 0x3262, +0x2e88, 0x3a54, 0x2ee3, 0x3575, 0x3afe, 0x2aee, 0x39a0, 0x3aae, 0x3693, 0x3432, 0x3834, 0x3b9b, 0x3bcb, 0x2e3a, 0x356d, 0x374e, 0x3924, 0x383c, 0x311e, 0x3ac5, 0x352d, 0x311e, 0x38ca, 0x34d4, 0x36ca, 0x34ed, 0x3a13, 0x33eb, 
0x3639, 0x3828, 0x3b3c, 0x3939, +0x3837, 0x3521, 0x2cb5, 0x3629, 0x3924, 0x384c, 0x366a, 0x3bbf, 0x2e9e, 0x3ba8, 0x33ad, 0x38c8, 0x3934, 0x3907, 0x249a, 0x3690, 0x3a09, 0x3215, 0x3898, 0x325d, 0x37d5, 0x3195, 0x361c, 0x3ae4, 0x351f, 0x3452, 0x3bc0, 0x375c, 0x39bf, 0x317a, 0x3aae, 0x283a, +0x3476, 0x3b92, 0x3472, 0x383e, 0x280f, 0x39d6, 0x2fd1, 0x31f4, 0x2ffb, 0x3b97, 0x3692, 0x36c0, 0x3989, 0x33cf, 0x3ba6, 0x3239, 0x35d7, 0x33ab, 0x31eb, 0x3b47, 0x389b, 0x3b88, 0x3580, 0x354c, 0x3802, 0x3b9a, 0x3b94, 0x2a92, 0x2db1, 0x38bd, 0x2dfb, 0x3900, +0x344f, 0x3739, 0x27a5, 0x3b2e, 0x342b, 0x34bb, 0x30c8, 0x3ae8, 0x3b26, 0x3982, 0x38c0, 0x3408, 0x38c8, 0x36ef, 0x3bf0, 0x3acf, 0x3a3c, 0x3825, 0x31a5, 0x3ada, 0x3b5b, 0x37db, 0x3a01, 0x3663, 0x3a7d, 0x327b, 0x3a1f, 0x3862, 0x38af, 0x3204, 0x372e, 0x3b19, +0x3708, 0x3622, 0x2e62, 0x39ab, 0x2d4d, 0x31b4, 0x3552, 0x3bbc, 0x36f2, 0x36eb, 0x38ef, 0x3755, 0x3bbe, 0x2c17, 0x3815, 0x2f53, 0x363f, 0x38c1, 0x3246, 0x386b, 0x34de, 0x34e4, 0x3baa, 0x349e, 0x32ce, 0x3a68, 0x373f, 0x2cce, 0x3b36, 0x28ba, 0x3b50, 0x3232, +0x1f34, 0x3928, 0x35cd, 0x3b38, 0x30ce, 0x35a1, 0x3a06, 0x3a32, 0x3a53, 0x3489, 0x3241, 0x372f, 0x390c, 0x3a1b, 0x378a, 0x3713, 0x3769, 0x37a8, 0x3418, 0x3ad4, 0x3a4e, 0x3bf7, 0x37a5, 0x34dc, 0x39b2, 0x351b, 0x3372, 0x349f, 0x2f50, 0x3ab1, 0x3795, 0x2db7, +0x3864, 0x3157, 0x3900, 0x323e, 0x389e, 0x3880, 0x3b1f, 0x37a1, 0x396c, 0x2e43, 0x2c2a, 0x3b78, 0x3988, 0x3a14, 0x39c1, 0x3b51, 0x3780, 0x3bf2, 0x2d19, 0x3815, 0x3a5f, 0x3641, 0x2f62, 0x37d5, 0x3564, 0x139a, 0x3ab8, 0x28f7, 0x3785, 0x34e1, 0x3097, 0x3768, +0x3971, 0x3ae2, 0x32ae, 0x2fd5, 0x382a, 0x346c, 0x3133, 0x3167, 0x3940, 0x2d12, 0x389a, 0x3bd0, 0x3943, 0x391c, 0x3a75, 0x2a11, 0x391e, 0x372d, 0x3a79, 0x3b72, 0x3373, 0x39b7, 0x35d7, 0x372b, 0x3a6d, 0x38a1, 0x3279, 0x3434, 0x3694, 0x3b45, 0x3abb, 0x392d, +0x34a8, 0x3757, 0x32ca, 0x345d, 0x36a5, 0x3854, 0x2dcd, 0x30af, 0x38dd, 0x3067, 0x3411, 0x3997, 0x397a, 0x3a64, 0x38b8, 0x3962, 0x3509, 0x3bb6, 0x3a66, 0x339f, 0x372a, 
0x31a8, 0x37da, 0x36ff, 0x33c6, 0x31da, 0x3977, 0x3b72, 0x3841, 0x3567, 0x3433, 0x33b8, +0x39fe, 0x3a10, 0x3bf2, 0x35e7, 0x3a4a, 0x3b3e, 0x2ec7, 0x3aa4, 0x3846, 0x3af9, 0x38a9, 0x2c1f, 0x39ab, 0x349f, 0x31d6, 0x39ae, 0x3b79, 0x352d, 0x3516, 0x347c, 0x2f33, 0x35ad, 0x31c4, 0x3b52, 0x354b, 0x3786, 0x3ab7, 0x3896, 0x34ac, 0x352f, 0x37e6, 0x326a, +0x2e44, 0x34c7, 0x388d, 0x3bf4, 0x363f, 0x3b3d, 0x33b1, 0x3b8b, 0x3340, 0x37f7, 0x3b07, 0x25bf, 0x398e, 0x3505, 0x3bd7, 0x366d, 0x388a, 0x2cc0, 0x359a, 0x3b9a, 0x3b99, 0x379d, 0x3b6b, 0x39b8, 0x3223, 0x2703, 0x3ba9, 0x2ecb, 0x3759, 0x39d8, 0x37ac, 0x32cf, +0x35f2, 0x38a3, 0x399e, 0x3bd2, 0x3780, 0x3af3, 0x3b5e, 0x337b, 0x3a08, 0x35da, 0x3446, 0x3b25, 0x3ad0, 0x3bee, 0x3141, 0x32d8, 0x34ce, 0x2ac9, 0x3800, 0x3a8a, 0x2d53, 0x368a, 0x3561, 0x3998, 0x35a3, 0x3677, 0x3ab2, 0x3269, 0x3236, 0x3b3e, 0x3aba, 0x3bac, +0x395d, 0x3820, 0x1df6, 0x3bb5, 0x35b5, 0x3675, 0x3b74, 0x360f, 0x34de, 0x3a0c, 0x3aeb, 0x299d, 0x3207, 0x3bd8, 0x2178, 0x3995, 0x3948, 0x3908, 0x3843, 0x2ea5, 0x3045, 0x3989, 0x345d, 0x39c5, 0x3a89, 0x3863, 0x3be0, 0x397a, 0x38f1, 0x39e2, 0x3b08, 0x352e, +0x385f, 0x28f2, 0x3bc3, 0x35e0, 0x380c, 0x3b9c, 0x3afc, 0x390a, 0x3689, 0x34fd, 0x2cf5, 0x308e, 0x342b, 0x3921, 0x3a67, 0x3ad6, 0x2986, 0x32fc, 0x35aa, 0x3507, 0x3608, 0x33fd, 0x3bf3, 0x39e2, 0x3b0f, 0x30b7, 0x3896, 0x3ae4, 0x2145, 0x35b6, 0x2e1d, 0x3ad1, +0x333d, 0x3afb, 0x2703, 0x3413, 0x1d7d, 0x3b7f, 0x3ae1, 0x303c, 0x3004, 0x39d3, 0x3554, 0x31a4, 0x354e, 0x3662, 0x39c5, 0x2eb7, 0x2c6e, 0x397f, 0x31d8, 0x1f0c, 0x38e3, 0x35f0, 0x2714, 0x28d1, 0x375e, 0x3a75, 0x3830, 0x3578, 0x397d, 0x3b18, 0x383c, 0x3498, +0x39ad, 0x3598, 0x23c4, 0x34ea, 0x3a61, 0x2b00, 0x3707, 0x3ae1, 0x37ae, 0x389d, 0x37fa, 0x3673, 0x3278, 0xf3e, 0x3809, 0x33c6, 0x3bf5, 0x3279, 0x3816, 0x360c, 0x39c8, 0x381f, 0x3741, 0x2d66, 0x38c0, 0x37d3, 0x377a, 0x3621, 0x2faf, 0x392e, 0x2de6, 0x33c5, +0x3803, 0x2600, 0x32e9, 0x39b4, 0x38d2, 0x34e8, 0x2fe6, 0x3199, 0x3643, 0x3a77, 0x27cc, 0x39d7, 0x34c6, 0x2ea8, 
0x364e, 0x3b07, 0x31c7, 0x30a1, 0x31b1, 0x3b8f, 0x3571, 0x3b75, 0x3989, 0x3805, 0x39fb, 0x3945, 0x352b, 0x31d8, 0x3904, 0x3440, 0x3a57, 0x2cf7, +0x3b39, 0x2fcd, 0x2b89, 0x2edd, 0x3682, 0x36a9, 0x32c8, 0x37ac, 0x32a5, 0x3311, 0x394b, 0x3b84, 0x3aec, 0x3601, 0x2765, 0x3b69, 0x396b, 0x3727, 0x3bfe, 0x3907, 0x376f, 0x3674, 0x3973, 0x3671, 0x3491, 0x3993, 0x383f, 0x3335, 0x3989, 0x3550, 0x3077, 0x35f5, +0x3a59, 0x3950, 0x380c, 0x37cd, 0x30bf, 0x3607, 0x3afa, 0x3b5d, 0x32b9, 0x386b, 0x35bd, 0x3aca, 0x3ba5, 0x3b2d, 0x3b19, 0x3b8b, 0x345e, 0x2845, 0x34aa, 0x372a, 0x3448, 0x34f5, 0x3ae2, 0x3637, 0x2cb5, 0x354b, 0x3b15, 0x2ca8, 0x2641, 0x3178, 0x2cfe, 0x39b4, +0x3bdd, 0x3acb, 0x3a05, 0x38a2, 0x3b4a, 0x34e5, 0x395f, 0x394b, 0x34c4, 0x3aa5, 0x29bb, 0x2d96, 0x339d, 0x387c, 0x382e, 0x385a, 0x396b, 0x3aa9, 0x2f1e, 0x33a7, 0x3b90, 0x3b7b, 0x3b5f, 0x39d3, 0x3b18, 0x354f, 0x2cdb, 0x3a6f, 0x3434, 0x34ff, 0x3a5b, 0x3b84, +0x3a33, 0x384b, 0x2e67, 0x3b85, 0x3853, 0x380c, 0x346a, 0x3aaa, 0x3492, 0x33e8, 0x3bf2, 0x38ae, 0x3a29, 0x3830, 0x3221, 0x35b1, 0x3a48, 0x2c68, 0x2ced, 0x3a7e, 0x3539, 0x3922, 0x374c, 0x3aaa, 0x2dae, 0x395d, 0x3b3d, 0x3890, 0x2cfe, 0x2dd6, 0x3bad, 0x33c5, +0x2c07, 0x3a2c, 0x37a8, 0x390f, 0x2fc8, 0x35ae, 0x388c, 0x30ee, 0x3674, 0x391d, 0x3bfc, 0x36bf, 0x322d, 0x3a78, 0x35c0, 0x3492, 0x3ac8, 0x3504, 0x3315, 0x381d, 0x3a7a, 0x3a08, 0x343c, 0x3bda, 0x341b, 0x39f0, 0x3b9e, 0x395d, 0x3c00, 0x38ab, 0x3bcf, 0x3564, +0x33c4, 0x3b0d, 0x3623, 0x33b9, 0x3b92, 0x1e71, 0x2c57, 0x36d0, 0x314b, 0x3a16, 0x3372, 0x341b, 0x3aaa, 0x3444, 0x396b, 0x2dd7, 0x3b30, 0x3559, 0x3b5b, 0x3a29, 0x2d19, 0x38b7, 0x3b01, 0x3afa, 0x398a, 0x3839, 0x3ac9, 0x2e31, 0x3924, 0x39f2, 0x3a7f, 0x3285 +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/x_2D.h b/hwpe/redmule/inc/x_2D.h new file mode 100644 index 0000000..0b589f8 --- /dev/null +++ b/hwpe/redmule/inc/x_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t x_inp_2D [24][32] = { +0x2153, 0x3bb5, 0x3896, 
0x365f, 0x2483, 0x3518, 0x2dd1, 0x3bca, 0x397b, 0x29b1, 0x3705, 0x36c8, 0x398b, 0x3661, 0x2f05, 0x365a, 0x3bf9, 0x34df, 0x363b, 0x38d9, 0x39c6, 0x3abb, 0x3952, 0x38f2, 0x392d, 0x3b3e, 0x2afb, 0x3a9d, 0x353b, 0x3b73, 0x3a01, 0x3679, +0x3934, 0x397d, 0x2904, 0x3822, 0x3462, 0x3b44, 0x39e9, 0x28be, 0x331e, 0x3a1d, 0x39e5, 0x34da, 0x3a19, 0x3906, 0x1d35, 0x3871, 0x31e7, 0x3b29, 0x325d, 0x3797, 0x2b2f, 0x38b4, 0x232f, 0x38aa, 0x3aca, 0x316f, 0x3811, 0x3950, 0x32ea, 0x3bc7, 0x382c, 0x38a2, +0x29ce, 0x3afa, 0x3a39, 0x2ccc, 0x39fd, 0x3b3d, 0x384a, 0x3a35, 0x3802, 0x366a, 0x37ec, 0x3598, 0x3bf8, 0x3a85, 0x3a1b, 0x386e, 0x3b4c, 0x39de, 0x38c2, 0x2f93, 0x3b4c, 0x39c4, 0x3b9e, 0x3844, 0x346d, 0x3bff, 0x32ce, 0x296d, 0x3130, 0x3b3d, 0x3b44, 0x369d, +0x3b13, 0x31ed, 0x330a, 0x3831, 0x34e7, 0x37b3, 0x331a, 0x3918, 0x32d3, 0x3995, 0x3991, 0x3919, 0x3a26, 0x385b, 0x2b76, 0x3a3b, 0x37f2, 0x26a7, 0x3225, 0x3b64, 0x28f0, 0x3456, 0x3822, 0x341e, 0x381a, 0x38d8, 0x2c11, 0x33be, 0x33ac, 0x353f, 0x3476, 0x3abc, +0x36ec, 0x3a1d, 0x39d3, 0x3821, 0x36ac, 0x3bce, 0x3ad2, 0x3616, 0x36a1, 0x2cb3, 0x38d2, 0x314f, 0x385c, 0x3b63, 0x3bb6, 0x2951, 0x372d, 0x2c42, 0x3823, 0x3883, 0x3872, 0x31ee, 0x36c5, 0x399a, 0x31b0, 0x3887, 0x3884, 0x3865, 0x3896, 0x36c3, 0x32e3, 0x346c, +0x3935, 0x3b50, 0x2b6d, 0x38cd, 0x388f, 0x3389, 0x395d, 0x31cd, 0x2efd, 0x3154, 0x2f35, 0x3444, 0x3293, 0x3b6b, 0x1bec, 0x3b69, 0x3bf3, 0x3611, 0x3508, 0x3742, 0x3a50, 0x3ab7, 0x3457, 0x38d3, 0x3344, 0x38e8, 0x33c0, 0x3668, 0x3bee, 0x3b21, 0x3727, 0x3121, +0x316c, 0x3288, 0x2d50, 0x2e74, 0x35d5, 0x37e2, 0x303d, 0x36af, 0x341f, 0x3436, 0x2df7, 0x399d, 0x30f4, 0x3aaf, 0x34e4, 0x2c2a, 0x3116, 0x34d3, 0x36ac, 0x35e3, 0x3760, 0x36e1, 0x3ad2, 0x3547, 0x38f4, 0x369c, 0x3ba9, 0x34f0, 0x3a39, 0x3b19, 0x36e6, 0x395d, +0x3be8, 0x3293, 0x3bfc, 0x3435, 0x2eb3, 0x3360, 0x3919, 0x3bed, 0x396a, 0x37fc, 0x3242, 0x384b, 0x38cb, 0x3b2c, 0x3b28, 0x28cf, 0x3828, 0x3855, 0x3ba9, 0x2fa7, 0x340b, 0x32f1, 0x3ada, 0x36fa, 0x31f5, 0x3436, 0x29d0, 0x33e6, 
0x3232, 0x3bec, 0x3904, 0x2797, +0x3b81, 0x3bac, 0x38d2, 0x343d, 0x31af, 0x3b1e, 0x33fc, 0x3864, 0x3624, 0x3905, 0x2945, 0x3b52, 0x2d08, 0x3a17, 0x3b84, 0x3804, 0x3a24, 0x38a3, 0x3562, 0x3ae6, 0x3bba, 0x3a45, 0x3679, 0x31fa, 0x3994, 0x2c3d, 0x383f, 0x399d, 0x34f7, 0x360e, 0x35f3, 0x38f0, +0x38d4, 0x399a, 0x3a48, 0x3987, 0x3b54, 0x382c, 0x3210, 0x35ef, 0x36ca, 0x31b4, 0x3625, 0x371f, 0x37bd, 0x3680, 0x3a3a, 0x3ac0, 0x3bbf, 0x3bf5, 0x39f2, 0x29c2, 0x363e, 0x3a4e, 0x3596, 0x3b1b, 0x3459, 0x3669, 0x3aa1, 0x39c3, 0x3376, 0x390d, 0x2456, 0x39b5, +0x3a66, 0x3ad8, 0x3b51, 0x36aa, 0x32be, 0x3ac8, 0x392b, 0x3740, 0x3a48, 0x38f5, 0x3b2d, 0x3a5f, 0x2ff3, 0x366f, 0x39d3, 0x35e5, 0x3822, 0x38db, 0x3b8a, 0x34be, 0x2d33, 0x36dd, 0x3578, 0x3bdf, 0x2c7e, 0x39cf, 0x32ff, 0x35c9, 0x3970, 0x3bcb, 0x351e, 0x3956, +0x2c42, 0x3308, 0x377a, 0x361c, 0x39a0, 0x36c9, 0x2dcb, 0x3bf2, 0x3b5f, 0x33ee, 0x24c1, 0x2ce9, 0x3927, 0x305d, 0x3702, 0x3119, 0x35f9, 0x3855, 0x3374, 0x349b, 0x3bcf, 0x2dea, 0x34f0, 0x363f, 0x37da, 0x3a74, 0x35fc, 0x35fa, 0x316b, 0x3804, 0x37a7, 0x3986, +0x3073, 0x3aed, 0x31c7, 0x3844, 0x34a4, 0x387d, 0x3a20, 0x3037, 0x3a00, 0x3b70, 0x377f, 0x3686, 0x3b7e, 0x38b3, 0x32e3, 0x3323, 0x391e, 0x3228, 0x3930, 0x3997, 0x3a5e, 0x398b, 0x3512, 0x35b0, 0x365c, 0x325d, 0x3b61, 0x38b8, 0x39a4, 0x3423, 0x3bd7, 0x38af, +0x2d3d, 0x382d, 0x38ac, 0x26ca, 0x395e, 0x21a8, 0x3520, 0x386f, 0x3b95, 0x32c0, 0x3b84, 0x3a51, 0x3b4b, 0x31d2, 0x3747, 0x3b96, 0x3b40, 0x3535, 0x38d1, 0x3899, 0x3b00, 0x3827, 0x3ae3, 0x38c8, 0x3a07, 0x338d, 0x2e96, 0x3a46, 0x394a, 0x39de, 0x2951, 0x3a02, +0x3838, 0x2d45, 0x28c0, 0x3958, 0x3070, 0x2aa2, 0x3510, 0x38ce, 0x271c, 0x3440, 0x3954, 0x30bc, 0x3b35, 0x2f1d, 0x3afb, 0x2dae, 0x356f, 0x2e13, 0x3981, 0x326d, 0x3a28, 0x3a36, 0x3a95, 0x38cb, 0x38db, 0x3150, 0x2c9e, 0x34c5, 0x3adb, 0x3bdf, 0x38f2, 0x3994, +0x36f8, 0x31c0, 0x3a4f, 0x3825, 0x394b, 0x3a8b, 0x38ac, 0x3167, 0x2e2d, 0x3a93, 0x34f3, 0x37bd, 0x3b63, 0x2f2f, 0x3ae0, 0x3ad8, 0x34a8, 0x2e1c, 0x3890, 0x3705, 0x3b69, 
0x3bc1, 0x28af, 0x3b36, 0x348b, 0x3111, 0x3a8d, 0x389c, 0x3916, 0x36dc, 0x3bae, 0x3874, +0x3593, 0x3638, 0x3018, 0x3a56, 0x38a3, 0x2ad4, 0x3a25, 0x38d7, 0x3864, 0x31c1, 0x28d1, 0x39c8, 0x37d6, 0x2c7f, 0x3ba5, 0x34b8, 0x3bef, 0x3b83, 0x3ab5, 0x3062, 0x38bc, 0x399c, 0x2ce4, 0x2f2c, 0x39bf, 0x2ed1, 0x385f, 0x37e0, 0x35ee, 0x397d, 0x3b0c, 0x3049, +0x39d5, 0x322e, 0x3936, 0x3747, 0x2e15, 0x3b41, 0x3874, 0x3bd0, 0x2c04, 0x3800, 0x375b, 0x3b2d, 0x38d8, 0x3a51, 0x3406, 0x38da, 0x38ba, 0x3497, 0x382e, 0x35fc, 0x39d4, 0x3775, 0x3b1e, 0x3813, 0x3649, 0x31af, 0x37bb, 0x334a, 0x3a6e, 0x3284, 0x26e0, 0x2e01, +0x2ebb, 0x344b, 0x3821, 0x381a, 0x385a, 0x2534, 0x3635, 0x2a92, 0x3b8c, 0x31f0, 0x3947, 0x3ac7, 0x3743, 0x3924, 0x39e4, 0x358f, 0x2b62, 0x392c, 0x3955, 0x3341, 0x3676, 0x38ac, 0x3957, 0x335b, 0x2ca2, 0x39ff, 0x37cb, 0x341f, 0x3ac9, 0x3b6c, 0x2f14, 0x34c3, +0x3018, 0x3169, 0x355b, 0x3624, 0x31ed, 0x379e, 0x3268, 0x309b, 0x35db, 0x3872, 0x3bdb, 0x34c7, 0x3408, 0x3359, 0x3920, 0x331f, 0x3866, 0x3af0, 0x2a1a, 0x39e0, 0x3b14, 0x34fa, 0x2d18, 0x3963, 0x35e8, 0x2539, 0x38f5, 0x37b3, 0x378f, 0x31b5, 0x3a6c, 0x3685, +0x3a06, 0x318a, 0x2934, 0x33c1, 0x3be8, 0x375b, 0x3860, 0x3543, 0x3702, 0x3951, 0x3677, 0x37ff, 0x2e27, 0x2e3a, 0x340f, 0x3817, 0x2f04, 0x357e, 0x3a1d, 0x2dd6, 0x252a, 0x3945, 0x162a, 0x3b19, 0x3a53, 0x35d2, 0x3a5d, 0x3474, 0x38e9, 0x374b, 0x387c, 0x1f1a, +0x38ac, 0x3291, 0x3393, 0x3b53, 0x3169, 0x3bca, 0x2f1a, 0x3551, 0x38a3, 0x28e3, 0x369d, 0x34a1, 0x38a8, 0x34c3, 0x3841, 0x390d, 0x3b13, 0x3282, 0x3a29, 0x3a78, 0x2df3, 0x3a37, 0x35f4, 0x35a6, 0x38e8, 0x3328, 0x3beb, 0x390b, 0x32dc, 0x34dc, 0x396d, 0x3a78, +0x39ba, 0x3a06, 0x2cdd, 0x3bc3, 0x2d43, 0x2992, 0x3663, 0x3a68, 0x2c3e, 0x394e, 0x2c9f, 0x380e, 0x37f5, 0x3557, 0x2873, 0x390f, 0x39e7, 0x3939, 0x3669, 0x385c, 0x3a68, 0x32c4, 0x2b04, 0x2d6d, 0x39d3, 0x3895, 0x331d, 0x3b59, 0x3463, 0x2b6a, 0x31de, 0x3296, +0x3aae, 0x3bcd, 0x345a, 0x3897, 0x374b, 0x3bd4, 0x38a2, 0x357f, 0x3402, 0x3a0c, 0x3507, 0x3865, 0x3a54, 0x3878, 
0x3859, 0x383e, 0x32b5, 0x34ea, 0x328d, 0x38b6, 0x3464, 0x2f5b, 0x35ff, 0x3817, 0x2f24, 0x3533, 0x3b21, 0x37ba, 0x3837, 0x2e34, 0x3bad, 0x34bc +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/x_input.h b/hwpe/redmule/inc/x_input.h new file mode 100644 index 0000000..1e38d23 --- /dev/null +++ b/hwpe/redmule/inc/x_input.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t x_inp [768] = { +0x2153, 0x3bb5, 0x3896, 0x365f, 0x2483, 0x3518, 0x2dd1, 0x3bca, 0x397b, 0x29b1, 0x3705, 0x36c8, 0x398b, 0x3661, 0x2f05, 0x365a, 0x3bf9, 0x34df, 0x363b, 0x38d9, 0x39c6, 0x3abb, 0x3952, 0x38f2, 0x392d, 0x3b3e, 0x2afb, 0x3a9d, 0x353b, 0x3b73, 0x3a01, 0x3679, +0x3934, 0x397d, 0x2904, 0x3822, 0x3462, 0x3b44, 0x39e9, 0x28be, 0x331e, 0x3a1d, 0x39e5, 0x34da, 0x3a19, 0x3906, 0x1d35, 0x3871, 0x31e7, 0x3b29, 0x325d, 0x3797, 0x2b2f, 0x38b4, 0x232f, 0x38aa, 0x3aca, 0x316f, 0x3811, 0x3950, 0x32ea, 0x3bc7, 0x382c, 0x38a2, +0x29ce, 0x3afa, 0x3a39, 0x2ccc, 0x39fd, 0x3b3d, 0x384a, 0x3a35, 0x3802, 0x366a, 0x37ec, 0x3598, 0x3bf8, 0x3a85, 0x3a1b, 0x386e, 0x3b4c, 0x39de, 0x38c2, 0x2f93, 0x3b4c, 0x39c4, 0x3b9e, 0x3844, 0x346d, 0x3bff, 0x32ce, 0x296d, 0x3130, 0x3b3d, 0x3b44, 0x369d, +0x3b13, 0x31ed, 0x330a, 0x3831, 0x34e7, 0x37b3, 0x331a, 0x3918, 0x32d3, 0x3995, 0x3991, 0x3919, 0x3a26, 0x385b, 0x2b76, 0x3a3b, 0x37f2, 0x26a7, 0x3225, 0x3b64, 0x28f0, 0x3456, 0x3822, 0x341e, 0x381a, 0x38d8, 0x2c11, 0x33be, 0x33ac, 0x353f, 0x3476, 0x3abc, +0x36ec, 0x3a1d, 0x39d3, 0x3821, 0x36ac, 0x3bce, 0x3ad2, 0x3616, 0x36a1, 0x2cb3, 0x38d2, 0x314f, 0x385c, 0x3b63, 0x3bb6, 0x2951, 0x372d, 0x2c42, 0x3823, 0x3883, 0x3872, 0x31ee, 0x36c5, 0x399a, 0x31b0, 0x3887, 0x3884, 0x3865, 0x3896, 0x36c3, 0x32e3, 0x346c, +0x3935, 0x3b50, 0x2b6d, 0x38cd, 0x388f, 0x3389, 0x395d, 0x31cd, 0x2efd, 0x3154, 0x2f35, 0x3444, 0x3293, 0x3b6b, 0x1bec, 0x3b69, 0x3bf3, 0x3611, 0x3508, 0x3742, 0x3a50, 0x3ab7, 0x3457, 0x38d3, 0x3344, 0x38e8, 0x33c0, 0x3668, 0x3bee, 0x3b21, 0x3727, 0x3121, +0x316c, 0x3288, 0x2d50, 
0x2e74, 0x35d5, 0x37e2, 0x303d, 0x36af, 0x341f, 0x3436, 0x2df7, 0x399d, 0x30f4, 0x3aaf, 0x34e4, 0x2c2a, 0x3116, 0x34d3, 0x36ac, 0x35e3, 0x3760, 0x36e1, 0x3ad2, 0x3547, 0x38f4, 0x369c, 0x3ba9, 0x34f0, 0x3a39, 0x3b19, 0x36e6, 0x395d, +0x3be8, 0x3293, 0x3bfc, 0x3435, 0x2eb3, 0x3360, 0x3919, 0x3bed, 0x396a, 0x37fc, 0x3242, 0x384b, 0x38cb, 0x3b2c, 0x3b28, 0x28cf, 0x3828, 0x3855, 0x3ba9, 0x2fa7, 0x340b, 0x32f1, 0x3ada, 0x36fa, 0x31f5, 0x3436, 0x29d0, 0x33e6, 0x3232, 0x3bec, 0x3904, 0x2797, +0x3b81, 0x3bac, 0x38d2, 0x343d, 0x31af, 0x3b1e, 0x33fc, 0x3864, 0x3624, 0x3905, 0x2945, 0x3b52, 0x2d08, 0x3a17, 0x3b84, 0x3804, 0x3a24, 0x38a3, 0x3562, 0x3ae6, 0x3bba, 0x3a45, 0x3679, 0x31fa, 0x3994, 0x2c3d, 0x383f, 0x399d, 0x34f7, 0x360e, 0x35f3, 0x38f0, +0x38d4, 0x399a, 0x3a48, 0x3987, 0x3b54, 0x382c, 0x3210, 0x35ef, 0x36ca, 0x31b4, 0x3625, 0x371f, 0x37bd, 0x3680, 0x3a3a, 0x3ac0, 0x3bbf, 0x3bf5, 0x39f2, 0x29c2, 0x363e, 0x3a4e, 0x3596, 0x3b1b, 0x3459, 0x3669, 0x3aa1, 0x39c3, 0x3376, 0x390d, 0x2456, 0x39b5, +0x3a66, 0x3ad8, 0x3b51, 0x36aa, 0x32be, 0x3ac8, 0x392b, 0x3740, 0x3a48, 0x38f5, 0x3b2d, 0x3a5f, 0x2ff3, 0x366f, 0x39d3, 0x35e5, 0x3822, 0x38db, 0x3b8a, 0x34be, 0x2d33, 0x36dd, 0x3578, 0x3bdf, 0x2c7e, 0x39cf, 0x32ff, 0x35c9, 0x3970, 0x3bcb, 0x351e, 0x3956, +0x2c42, 0x3308, 0x377a, 0x361c, 0x39a0, 0x36c9, 0x2dcb, 0x3bf2, 0x3b5f, 0x33ee, 0x24c1, 0x2ce9, 0x3927, 0x305d, 0x3702, 0x3119, 0x35f9, 0x3855, 0x3374, 0x349b, 0x3bcf, 0x2dea, 0x34f0, 0x363f, 0x37da, 0x3a74, 0x35fc, 0x35fa, 0x316b, 0x3804, 0x37a7, 0x3986, +0x3073, 0x3aed, 0x31c7, 0x3844, 0x34a4, 0x387d, 0x3a20, 0x3037, 0x3a00, 0x3b70, 0x377f, 0x3686, 0x3b7e, 0x38b3, 0x32e3, 0x3323, 0x391e, 0x3228, 0x3930, 0x3997, 0x3a5e, 0x398b, 0x3512, 0x35b0, 0x365c, 0x325d, 0x3b61, 0x38b8, 0x39a4, 0x3423, 0x3bd7, 0x38af, +0x2d3d, 0x382d, 0x38ac, 0x26ca, 0x395e, 0x21a8, 0x3520, 0x386f, 0x3b95, 0x32c0, 0x3b84, 0x3a51, 0x3b4b, 0x31d2, 0x3747, 0x3b96, 0x3b40, 0x3535, 0x38d1, 0x3899, 0x3b00, 0x3827, 0x3ae3, 0x38c8, 0x3a07, 0x338d, 0x2e96, 0x3a46, 
0x394a, 0x39de, 0x2951, 0x3a02, +0x3838, 0x2d45, 0x28c0, 0x3958, 0x3070, 0x2aa2, 0x3510, 0x38ce, 0x271c, 0x3440, 0x3954, 0x30bc, 0x3b35, 0x2f1d, 0x3afb, 0x2dae, 0x356f, 0x2e13, 0x3981, 0x326d, 0x3a28, 0x3a36, 0x3a95, 0x38cb, 0x38db, 0x3150, 0x2c9e, 0x34c5, 0x3adb, 0x3bdf, 0x38f2, 0x3994, +0x36f8, 0x31c0, 0x3a4f, 0x3825, 0x394b, 0x3a8b, 0x38ac, 0x3167, 0x2e2d, 0x3a93, 0x34f3, 0x37bd, 0x3b63, 0x2f2f, 0x3ae0, 0x3ad8, 0x34a8, 0x2e1c, 0x3890, 0x3705, 0x3b69, 0x3bc1, 0x28af, 0x3b36, 0x348b, 0x3111, 0x3a8d, 0x389c, 0x3916, 0x36dc, 0x3bae, 0x3874, +0x3593, 0x3638, 0x3018, 0x3a56, 0x38a3, 0x2ad4, 0x3a25, 0x38d7, 0x3864, 0x31c1, 0x28d1, 0x39c8, 0x37d6, 0x2c7f, 0x3ba5, 0x34b8, 0x3bef, 0x3b83, 0x3ab5, 0x3062, 0x38bc, 0x399c, 0x2ce4, 0x2f2c, 0x39bf, 0x2ed1, 0x385f, 0x37e0, 0x35ee, 0x397d, 0x3b0c, 0x3049, +0x39d5, 0x322e, 0x3936, 0x3747, 0x2e15, 0x3b41, 0x3874, 0x3bd0, 0x2c04, 0x3800, 0x375b, 0x3b2d, 0x38d8, 0x3a51, 0x3406, 0x38da, 0x38ba, 0x3497, 0x382e, 0x35fc, 0x39d4, 0x3775, 0x3b1e, 0x3813, 0x3649, 0x31af, 0x37bb, 0x334a, 0x3a6e, 0x3284, 0x26e0, 0x2e01, +0x2ebb, 0x344b, 0x3821, 0x381a, 0x385a, 0x2534, 0x3635, 0x2a92, 0x3b8c, 0x31f0, 0x3947, 0x3ac7, 0x3743, 0x3924, 0x39e4, 0x358f, 0x2b62, 0x392c, 0x3955, 0x3341, 0x3676, 0x38ac, 0x3957, 0x335b, 0x2ca2, 0x39ff, 0x37cb, 0x341f, 0x3ac9, 0x3b6c, 0x2f14, 0x34c3, +0x3018, 0x3169, 0x355b, 0x3624, 0x31ed, 0x379e, 0x3268, 0x309b, 0x35db, 0x3872, 0x3bdb, 0x34c7, 0x3408, 0x3359, 0x3920, 0x331f, 0x3866, 0x3af0, 0x2a1a, 0x39e0, 0x3b14, 0x34fa, 0x2d18, 0x3963, 0x35e8, 0x2539, 0x38f5, 0x37b3, 0x378f, 0x31b5, 0x3a6c, 0x3685, +0x3a06, 0x318a, 0x2934, 0x33c1, 0x3be8, 0x375b, 0x3860, 0x3543, 0x3702, 0x3951, 0x3677, 0x37ff, 0x2e27, 0x2e3a, 0x340f, 0x3817, 0x2f04, 0x357e, 0x3a1d, 0x2dd6, 0x252a, 0x3945, 0x162a, 0x3b19, 0x3a53, 0x35d2, 0x3a5d, 0x3474, 0x38e9, 0x374b, 0x387c, 0x1f1a, +0x38ac, 0x3291, 0x3393, 0x3b53, 0x3169, 0x3bca, 0x2f1a, 0x3551, 0x38a3, 0x28e3, 0x369d, 0x34a1, 0x38a8, 0x34c3, 0x3841, 0x390d, 0x3b13, 0x3282, 0x3a29, 0x3a78, 0x2df3, 
0x3a37, 0x35f4, 0x35a6, 0x38e8, 0x3328, 0x3beb, 0x390b, 0x32dc, 0x34dc, 0x396d, 0x3a78, +0x39ba, 0x3a06, 0x2cdd, 0x3bc3, 0x2d43, 0x2992, 0x3663, 0x3a68, 0x2c3e, 0x394e, 0x2c9f, 0x380e, 0x37f5, 0x3557, 0x2873, 0x390f, 0x39e7, 0x3939, 0x3669, 0x385c, 0x3a68, 0x32c4, 0x2b04, 0x2d6d, 0x39d3, 0x3895, 0x331d, 0x3b59, 0x3463, 0x2b6a, 0x31de, 0x3296, +0x3aae, 0x3bcd, 0x345a, 0x3897, 0x374b, 0x3bd4, 0x38a2, 0x357f, 0x3402, 0x3a0c, 0x3507, 0x3865, 0x3a54, 0x3878, 0x3859, 0x383e, 0x32b5, 0x34ea, 0x328d, 0x38b6, 0x3464, 0x2f5b, 0x35ff, 0x3817, 0x2f24, 0x3533, 0x3b21, 0x37ba, 0x3837, 0x2e34, 0x3bad, 0x34bc +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/y_2D.h b/hwpe/redmule/inc/y_2D.h new file mode 100644 index 0000000..9484a10 --- /dev/null +++ b/hwpe/redmule/inc/y_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t y_inp_2D [32][32] = { +0x3150, 0x2dc1, 0x3033, 0x31f5, 0x3bb6, 0x3bff, 0x39f9, 0x3662, 0x3720, 0x351d, 0x384b, 0x3093, 0x3b9d, 0x35ad, 0x3695, 0x3466, 0x2300, 0x3445, 0x33ae, 0x3586, 0x38a3, 0x3bdb, 0x33a2, 0x379b, 0x3a0e, 0x38b0, 0x39ba, 0x379b, 0x39d3, 0x3a51, 0x3b30, 0x3794, +0x3b76, 0x3042, 0x38cc, 0x2dfc, 0x3b1a, 0x37fb, 0x38f7, 0x3824, 0x386f, 0x38c7, 0x36ee, 0x3a9c, 0x38d3, 0x2c67, 0x3a80, 0x2f30, 0x3328, 0x3721, 0x3790, 0x34e5, 0x3a6c, 0x3643, 0x3934, 0x3034, 0x38d4, 0x362e, 0x3b4b, 0x3408, 0x30c2, 0x370e, 0x3b31, 0x3b16, +0x3b6b, 0x39d4, 0x339c, 0x381e, 0x313e, 0x3671, 0x3ae2, 0x3479, 0x3940, 0x342d, 0x3925, 0x370a, 0x35d8, 0x2dad, 0x3888, 0x24b9, 0x375d, 0x34bd, 0x3243, 0x2ebb, 0x3970, 0x3a21, 0x3a07, 0x3877, 0x3888, 0x3569, 0x372d, 0x2ac1, 0x331e, 0x384d, 0x3996, 0x34a4, +0x35c1, 0x33a9, 0x21ed, 0x3a42, 0x388d, 0x34e4, 0x33c3, 0x34f9, 0x3a7b, 0x33fb, 0x2cdd, 0x3b0e, 0x333b, 0x3973, 0x34fc, 0x3771, 0x32ea, 0x2de4, 0x31a8, 0x3946, 0x3657, 0x3a4e, 0x36f6, 0x2829, 0x3ba2, 0x3bdc, 0x3bb3, 0x306c, 0x398d, 0x3a1f, 0x3991, 0x3846, +0x3547, 0x3292, 0x2e85, 0x31ed, 0x3979, 0x3a90, 0x28a4, 0x3bed, 0x36d8, 0x340e, 
0x3b6a, 0x3ab6, 0x3824, 0x382b, 0x3ac3, 0x3811, 0x36d7, 0x3519, 0x3a92, 0x3a42, 0x29d1, 0x383a, 0x3a9b, 0x300e, 0x2cd3, 0x39cd, 0x3874, 0x3a07, 0x2eb1, 0x3b86, 0x3ad8, 0x3a5d, +0x3712, 0x284a, 0x38c1, 0x3bec, 0x39c0, 0x32cd, 0x3ad8, 0x3bce, 0x3817, 0x3896, 0x3aa7, 0x3870, 0x3996, 0x32cc, 0x3a4c, 0x3757, 0x3814, 0x3b65, 0x3acb, 0x376e, 0x34c0, 0x3609, 0x3bf0, 0x3b24, 0x3b29, 0x3848, 0x34b7, 0x398a, 0x220c, 0x3498, 0x3a8c, 0x3883, +0x38c4, 0x3af6, 0x3a42, 0x2dd6, 0x3147, 0x3717, 0x3a8e, 0x3af9, 0x3296, 0x38ef, 0x34fa, 0x3555, 0x3b29, 0x38de, 0x315e, 0x3773, 0x3b67, 0x3116, 0x38ec, 0x357c, 0x35d0, 0x2518, 0x3958, 0x2a03, 0x37d9, 0x3699, 0x3a1e, 0x3230, 0x3b13, 0x36d4, 0x3b2a, 0x39ad, +0x3b10, 0x351a, 0x3b97, 0x3326, 0x2b54, 0x3b7d, 0x386f, 0x373e, 0x37fa, 0x389b, 0x3b90, 0x3292, 0x3975, 0x38f3, 0x37f1, 0x3590, 0x3810, 0x2fd7, 0x3bf7, 0x3a5a, 0x3a1c, 0x34dd, 0x354c, 0x32f8, 0x3095, 0x321e, 0x39e0, 0x395c, 0x3717, 0x357f, 0x394a, 0x34b1, +0x3ba4, 0x380c, 0x3604, 0x2f50, 0x348d, 0x3828, 0x3a9f, 0x39ce, 0x32ca, 0x3906, 0x3ab2, 0x2ca5, 0x38c9, 0x362a, 0x34b2, 0x29dc, 0x3a36, 0x3052, 0x31b7, 0x3589, 0x387c, 0x3401, 0x3b22, 0x3ad6, 0x3ae8, 0x3238, 0x3494, 0x3502, 0x3717, 0x3a6c, 0x3229, 0x368c, +0x3056, 0x3a56, 0x3498, 0x39eb, 0x2864, 0x342d, 0x39e0, 0x34a1, 0x2b99, 0x3a04, 0x38ff, 0x328c, 0x34d9, 0x387d, 0x3a3c, 0x32e5, 0x39eb, 0x3984, 0x34dd, 0x38a7, 0x373f, 0x39b4, 0x3235, 0x2f58, 0x2f39, 0x3800, 0x3758, 0x3939, 0x39fc, 0x3a4b, 0x38bf, 0x30ee, +0x345e, 0x39c8, 0x3a6d, 0x3262, 0x3b81, 0x31dc, 0x3a15, 0x3bd0, 0x36af, 0x36de, 0x37d5, 0x39d7, 0x3ad3, 0x3ac1, 0x3109, 0x35ea, 0x31c6, 0x398d, 0x3987, 0x3a4a, 0x34d2, 0x2ed2, 0x35e6, 0x352c, 0x39eb, 0x3bd6, 0x3a5b, 0x39d1, 0x34aa, 0x3ade, 0x394b, 0x38a1, +0x2bed, 0x38de, 0x3811, 0x3813, 0x391a, 0x374b, 0x3829, 0x3725, 0x38f0, 0x3583, 0x3966, 0x3a7d, 0x375a, 0x38fe, 0x3696, 0x361c, 0x39a8, 0x35f0, 0x38e1, 0x3003, 0x3595, 0x316e, 0x3862, 0x3af8, 0x3af2, 0x34c8, 0x381d, 0x37d8, 0x3893, 0x3a9c, 0x3989, 0x308c, +0x30cc, 0x2538, 0x399d, 
0x3919, 0x399e, 0x21cc, 0x38e9, 0x30f8, 0x3a20, 0x3b3c, 0x3990, 0x259c, 0x3143, 0x3080, 0x3967, 0x3afb, 0x3a1b, 0x3779, 0x2eeb, 0x39f3, 0x379a, 0x369c, 0x3985, 0x3a1b, 0x3ba6, 0x3a53, 0x28d5, 0x3881, 0x31d9, 0x3a34, 0x3bd9, 0x393a, +0x3601, 0x2c6e, 0x3636, 0x3298, 0x39bb, 0x3a08, 0x38db, 0x35ad, 0x3a09, 0x36a6, 0x3bc7, 0x3bac, 0x34ae, 0x3291, 0x290b, 0x3250, 0x2648, 0x333d, 0x2bf3, 0x34b1, 0x30e0, 0x351f, 0x3a74, 0x38dc, 0x3883, 0x2841, 0x35e1, 0x390d, 0x3a50, 0x3abd, 0x386d, 0x3bb7, +0x3b94, 0x36b7, 0x3a49, 0x332f, 0x3a1d, 0x354b, 0x3bab, 0x3346, 0x3417, 0x351e, 0x3b6d, 0x391a, 0x2db3, 0x3b1c, 0x3a4a, 0x37b7, 0x36cf, 0x3a56, 0x39c4, 0x3be9, 0x34f0, 0x39be, 0x3691, 0x1ba5, 0x3888, 0x3040, 0x3ae1, 0x3b9b, 0x398f, 0x3a49, 0x3a16, 0x38c0, +0x386c, 0x39ab, 0x37fa, 0x382c, 0x3a6f, 0x393f, 0x340d, 0x38ef, 0x39d1, 0x3845, 0x398f, 0x363e, 0x3687, 0x3052, 0x3a2b, 0x392c, 0x2f5c, 0x3412, 0x3a1f, 0x3b2f, 0x3bcc, 0x3a63, 0x3a89, 0x36e9, 0x3921, 0x3b80, 0x2dc0, 0x3a03, 0x3beb, 0x38d3, 0x36cb, 0x39a3, +0x3978, 0x3a88, 0x3ba4, 0x3561, 0x28c5, 0x33a0, 0x37be, 0x2c39, 0x30ee, 0x3782, 0x2c07, 0x354e, 0x3491, 0x3a92, 0x331a, 0x3b15, 0x32e1, 0x3839, 0x3afb, 0x36c2, 0x2fd0, 0x29ad, 0x3b2e, 0x39c1, 0x2a8c, 0x341a, 0x2f90, 0x395a, 0x3969, 0x37ea, 0x3a5c, 0x3b6d, +0x3971, 0x3a93, 0x304e, 0x3623, 0x3a22, 0x31ee, 0x29df, 0x2c93, 0x3a01, 0x3a62, 0x366c, 0x371d, 0x3af3, 0x2e08, 0x3ac0, 0x3642, 0x3a28, 0x368d, 0x2d3d, 0x36d9, 0x32c3, 0x373f, 0x36fe, 0x3487, 0x2c81, 0x3623, 0x3b59, 0x3a91, 0x350a, 0x34f4, 0x3b09, 0x2c25, +0x3b13, 0x325a, 0x379e, 0x3a7d, 0x34b1, 0x39d5, 0x2ba8, 0x322b, 0x3b5e, 0x37ab, 0x2e24, 0x3ba9, 0x3a3d, 0x34f7, 0x3ba1, 0x3877, 0x3071, 0x39fb, 0x3bbd, 0x3633, 0x3b36, 0x2daa, 0x3b9b, 0x3aa0, 0x395c, 0x3b8f, 0x38d5, 0x3ab0, 0x3a8f, 0x36c2, 0x3b1f, 0x3489, +0x2acc, 0x3845, 0x3715, 0x37d8, 0x3992, 0x3bff, 0x350e, 0x3ad7, 0x39b0, 0x35ac, 0x3287, 0x385f, 0x3bd4, 0x37a3, 0x3438, 0x39a5, 0x3bcf, 0x38c3, 0x34f6, 0x3ae3, 0x3b57, 0x39af, 0x35eb, 0x3bed, 0x34d4, 0x2a95, 0x3b13, 0x384e, 
0x3a3b, 0x33da, 0x3bce, 0x3b99, +0x3559, 0x3335, 0x3a2e, 0x3123, 0x38db, 0x33d0, 0x3638, 0x3b17, 0x3a72, 0x3afc, 0x3936, 0x3838, 0x2b69, 0x3895, 0x3a1a, 0x3192, 0x39d5, 0x37a5, 0x2eb0, 0x2e8b, 0x329a, 0x3b90, 0x390a, 0x3a1e, 0x3847, 0x375d, 0x3873, 0x35e2, 0x3771, 0x30f5, 0x3231, 0x3bd7, +0x2bbc, 0x3ace, 0x31ad, 0x3a6b, 0x28a4, 0x3b48, 0x3ba3, 0x3a84, 0x3353, 0x39f6, 0x381f, 0x2dd6, 0x314c, 0x34af, 0x3929, 0x3921, 0x383b, 0x34b0, 0x3923, 0x32c9, 0x3ae7, 0x318f, 0x3480, 0x2ad8, 0x3042, 0x3a4c, 0x349d, 0x2c12, 0x3abb, 0x3a57, 0x3b0d, 0x3111, +0x3359, 0x3a84, 0x38f2, 0x368d, 0x2f4b, 0x3ba0, 0x395c, 0x3026, 0x3a15, 0x2a04, 0x326e, 0x3522, 0x31a2, 0x382f, 0x2ada, 0x3b7c, 0x2f80, 0x3af5, 0x2d35, 0x38fa, 0x39ab, 0x2c6d, 0x2e7a, 0x39f6, 0x31a4, 0x3a53, 0x358c, 0x3951, 0x3a4e, 0x3916, 0x2a3f, 0x3ae9, +0x3b03, 0x39f8, 0x39fe, 0x3a61, 0x39fb, 0x3704, 0x360d, 0x39a7, 0x37a9, 0x348f, 0x3a30, 0x3af5, 0x366f, 0x3b29, 0x3a6a, 0x33d5, 0x370a, 0x39cd, 0x3444, 0x3bea, 0x3b2b, 0x312e, 0x3b8e, 0x32cf, 0x3b79, 0x3302, 0x3bba, 0x3962, 0x3413, 0x37a1, 0x39e0, 0x3805 +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/y_input.h b/hwpe/redmule/inc/y_input.h new file mode 100644 index 0000000..45a2375 --- /dev/null +++ b/hwpe/redmule/inc/y_input.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t y_inp [768] = { +0x3150, 0x2dc1, 0x3033, 0x31f5, 0x3bb6, 0x3bff, 0x39f9, 0x3662, 0x3720, 0x351d, 0x384b, 0x3093, 0x3b9d, 0x35ad, 0x3695, 0x3466, 0x2300, 0x3445, 0x33ae, 0x3586, 0x38a3, 0x3bdb, 0x33a2, 0x379b, 0x3a0e, 0x38b0, 0x39ba, 0x379b, 0x39d3, 0x3a51, 0x3b30, 0x3794, +0x3b76, 0x3042, 0x38cc, 0x2dfc, 0x3b1a, 0x37fb, 0x38f7, 0x3824, 0x386f, 0x38c7, 0x36ee, 0x3a9c, 0x38d3, 0x2c67, 0x3a80, 0x2f30, 0x3328, 0x3721, 0x3790, 0x34e5, 0x3a6c, 0x3643, 0x3934, 0x3034, 0x38d4, 0x362e, 0x3b4b, 0x3408, 0x30c2, 0x370e, 0x3b31, 0x3b16, +0x3b6b, 0x39d4, 0x339c, 0x381e, 0x313e, 0x3671, 0x3ae2, 0x3479, 0x3940, 0x342d, 0x3925, 0x370a, 0x35d8, 0x2dad, 0x3888, 0x24b9, 0x375d, 
0x34bd, 0x3243, 0x2ebb, 0x3970, 0x3a21, 0x3a07, 0x3877, 0x3888, 0x3569, 0x372d, 0x2ac1, 0x331e, 0x384d, 0x3996, 0x34a4, +0x35c1, 0x33a9, 0x21ed, 0x3a42, 0x388d, 0x34e4, 0x33c3, 0x34f9, 0x3a7b, 0x33fb, 0x2cdd, 0x3b0e, 0x333b, 0x3973, 0x34fc, 0x3771, 0x32ea, 0x2de4, 0x31a8, 0x3946, 0x3657, 0x3a4e, 0x36f6, 0x2829, 0x3ba2, 0x3bdc, 0x3bb3, 0x306c, 0x398d, 0x3a1f, 0x3991, 0x3846, +0x3547, 0x3292, 0x2e85, 0x31ed, 0x3979, 0x3a90, 0x28a4, 0x3bed, 0x36d8, 0x340e, 0x3b6a, 0x3ab6, 0x3824, 0x382b, 0x3ac3, 0x3811, 0x36d7, 0x3519, 0x3a92, 0x3a42, 0x29d1, 0x383a, 0x3a9b, 0x300e, 0x2cd3, 0x39cd, 0x3874, 0x3a07, 0x2eb1, 0x3b86, 0x3ad8, 0x3a5d, +0x3712, 0x284a, 0x38c1, 0x3bec, 0x39c0, 0x32cd, 0x3ad8, 0x3bce, 0x3817, 0x3896, 0x3aa7, 0x3870, 0x3996, 0x32cc, 0x3a4c, 0x3757, 0x3814, 0x3b65, 0x3acb, 0x376e, 0x34c0, 0x3609, 0x3bf0, 0x3b24, 0x3b29, 0x3848, 0x34b7, 0x398a, 0x220c, 0x3498, 0x3a8c, 0x3883, +0x38c4, 0x3af6, 0x3a42, 0x2dd6, 0x3147, 0x3717, 0x3a8e, 0x3af9, 0x3296, 0x38ef, 0x34fa, 0x3555, 0x3b29, 0x38de, 0x315e, 0x3773, 0x3b67, 0x3116, 0x38ec, 0x357c, 0x35d0, 0x2518, 0x3958, 0x2a03, 0x37d9, 0x3699, 0x3a1e, 0x3230, 0x3b13, 0x36d4, 0x3b2a, 0x39ad, +0x3b10, 0x351a, 0x3b97, 0x3326, 0x2b54, 0x3b7d, 0x386f, 0x373e, 0x37fa, 0x389b, 0x3b90, 0x3292, 0x3975, 0x38f3, 0x37f1, 0x3590, 0x3810, 0x2fd7, 0x3bf7, 0x3a5a, 0x3a1c, 0x34dd, 0x354c, 0x32f8, 0x3095, 0x321e, 0x39e0, 0x395c, 0x3717, 0x357f, 0x394a, 0x34b1, +0x3ba4, 0x380c, 0x3604, 0x2f50, 0x348d, 0x3828, 0x3a9f, 0x39ce, 0x32ca, 0x3906, 0x3ab2, 0x2ca5, 0x38c9, 0x362a, 0x34b2, 0x29dc, 0x3a36, 0x3052, 0x31b7, 0x3589, 0x387c, 0x3401, 0x3b22, 0x3ad6, 0x3ae8, 0x3238, 0x3494, 0x3502, 0x3717, 0x3a6c, 0x3229, 0x368c, +0x3056, 0x3a56, 0x3498, 0x39eb, 0x2864, 0x342d, 0x39e0, 0x34a1, 0x2b99, 0x3a04, 0x38ff, 0x328c, 0x34d9, 0x387d, 0x3a3c, 0x32e5, 0x39eb, 0x3984, 0x34dd, 0x38a7, 0x373f, 0x39b4, 0x3235, 0x2f58, 0x2f39, 0x3800, 0x3758, 0x3939, 0x39fc, 0x3a4b, 0x38bf, 0x30ee, +0x345e, 0x39c8, 0x3a6d, 0x3262, 0x3b81, 0x31dc, 0x3a15, 0x3bd0, 0x36af, 0x36de, 
0x37d5, 0x39d7, 0x3ad3, 0x3ac1, 0x3109, 0x35ea, 0x31c6, 0x398d, 0x3987, 0x3a4a, 0x34d2, 0x2ed2, 0x35e6, 0x352c, 0x39eb, 0x3bd6, 0x3a5b, 0x39d1, 0x34aa, 0x3ade, 0x394b, 0x38a1, +0x2bed, 0x38de, 0x3811, 0x3813, 0x391a, 0x374b, 0x3829, 0x3725, 0x38f0, 0x3583, 0x3966, 0x3a7d, 0x375a, 0x38fe, 0x3696, 0x361c, 0x39a8, 0x35f0, 0x38e1, 0x3003, 0x3595, 0x316e, 0x3862, 0x3af8, 0x3af2, 0x34c8, 0x381d, 0x37d8, 0x3893, 0x3a9c, 0x3989, 0x308c, +0x30cc, 0x2538, 0x399d, 0x3919, 0x399e, 0x21cc, 0x38e9, 0x30f8, 0x3a20, 0x3b3c, 0x3990, 0x259c, 0x3143, 0x3080, 0x3967, 0x3afb, 0x3a1b, 0x3779, 0x2eeb, 0x39f3, 0x379a, 0x369c, 0x3985, 0x3a1b, 0x3ba6, 0x3a53, 0x28d5, 0x3881, 0x31d9, 0x3a34, 0x3bd9, 0x393a, +0x3601, 0x2c6e, 0x3636, 0x3298, 0x39bb, 0x3a08, 0x38db, 0x35ad, 0x3a09, 0x36a6, 0x3bc7, 0x3bac, 0x34ae, 0x3291, 0x290b, 0x3250, 0x2648, 0x333d, 0x2bf3, 0x34b1, 0x30e0, 0x351f, 0x3a74, 0x38dc, 0x3883, 0x2841, 0x35e1, 0x390d, 0x3a50, 0x3abd, 0x386d, 0x3bb7, +0x3b94, 0x36b7, 0x3a49, 0x332f, 0x3a1d, 0x354b, 0x3bab, 0x3346, 0x3417, 0x351e, 0x3b6d, 0x391a, 0x2db3, 0x3b1c, 0x3a4a, 0x37b7, 0x36cf, 0x3a56, 0x39c4, 0x3be9, 0x34f0, 0x39be, 0x3691, 0x1ba5, 0x3888, 0x3040, 0x3ae1, 0x3b9b, 0x398f, 0x3a49, 0x3a16, 0x38c0, +0x386c, 0x39ab, 0x37fa, 0x382c, 0x3a6f, 0x393f, 0x340d, 0x38ef, 0x39d1, 0x3845, 0x398f, 0x363e, 0x3687, 0x3052, 0x3a2b, 0x392c, 0x2f5c, 0x3412, 0x3a1f, 0x3b2f, 0x3bcc, 0x3a63, 0x3a89, 0x36e9, 0x3921, 0x3b80, 0x2dc0, 0x3a03, 0x3beb, 0x38d3, 0x36cb, 0x39a3, +0x3978, 0x3a88, 0x3ba4, 0x3561, 0x28c5, 0x33a0, 0x37be, 0x2c39, 0x30ee, 0x3782, 0x2c07, 0x354e, 0x3491, 0x3a92, 0x331a, 0x3b15, 0x32e1, 0x3839, 0x3afb, 0x36c2, 0x2fd0, 0x29ad, 0x3b2e, 0x39c1, 0x2a8c, 0x341a, 0x2f90, 0x395a, 0x3969, 0x37ea, 0x3a5c, 0x3b6d, +0x3971, 0x3a93, 0x304e, 0x3623, 0x3a22, 0x31ee, 0x29df, 0x2c93, 0x3a01, 0x3a62, 0x366c, 0x371d, 0x3af3, 0x2e08, 0x3ac0, 0x3642, 0x3a28, 0x368d, 0x2d3d, 0x36d9, 0x32c3, 0x373f, 0x36fe, 0x3487, 0x2c81, 0x3623, 0x3b59, 0x3a91, 0x350a, 0x34f4, 0x3b09, 0x2c25, +0x3b13, 0x325a, 0x379e, 
0x3a7d, 0x34b1, 0x39d5, 0x2ba8, 0x322b, 0x3b5e, 0x37ab, 0x2e24, 0x3ba9, 0x3a3d, 0x34f7, 0x3ba1, 0x3877, 0x3071, 0x39fb, 0x3bbd, 0x3633, 0x3b36, 0x2daa, 0x3b9b, 0x3aa0, 0x395c, 0x3b8f, 0x38d5, 0x3ab0, 0x3a8f, 0x36c2, 0x3b1f, 0x3489, +0x2acc, 0x3845, 0x3715, 0x37d8, 0x3992, 0x3bff, 0x350e, 0x3ad7, 0x39b0, 0x35ac, 0x3287, 0x385f, 0x3bd4, 0x37a3, 0x3438, 0x39a5, 0x3bcf, 0x38c3, 0x34f6, 0x3ae3, 0x3b57, 0x39af, 0x35eb, 0x3bed, 0x34d4, 0x2a95, 0x3b13, 0x384e, 0x3a3b, 0x33da, 0x3bce, 0x3b99, +0x3559, 0x3335, 0x3a2e, 0x3123, 0x38db, 0x33d0, 0x3638, 0x3b17, 0x3a72, 0x3afc, 0x3936, 0x3838, 0x2b69, 0x3895, 0x3a1a, 0x3192, 0x39d5, 0x37a5, 0x2eb0, 0x2e8b, 0x329a, 0x3b90, 0x390a, 0x3a1e, 0x3847, 0x375d, 0x3873, 0x35e2, 0x3771, 0x30f5, 0x3231, 0x3bd7, +0x2bbc, 0x3ace, 0x31ad, 0x3a6b, 0x28a4, 0x3b48, 0x3ba3, 0x3a84, 0x3353, 0x39f6, 0x381f, 0x2dd6, 0x314c, 0x34af, 0x3929, 0x3921, 0x383b, 0x34b0, 0x3923, 0x32c9, 0x3ae7, 0x318f, 0x3480, 0x2ad8, 0x3042, 0x3a4c, 0x349d, 0x2c12, 0x3abb, 0x3a57, 0x3b0d, 0x3111, +0x3359, 0x3a84, 0x38f2, 0x368d, 0x2f4b, 0x3ba0, 0x395c, 0x3026, 0x3a15, 0x2a04, 0x326e, 0x3522, 0x31a2, 0x382f, 0x2ada, 0x3b7c, 0x2f80, 0x3af5, 0x2d35, 0x38fa, 0x39ab, 0x2c6d, 0x2e7a, 0x39f6, 0x31a4, 0x3a53, 0x358c, 0x3951, 0x3a4e, 0x3916, 0x2a3f, 0x3ae9, +0x3b03, 0x39f8, 0x39fe, 0x3a61, 0x39fb, 0x3704, 0x360d, 0x39a7, 0x37a9, 0x348f, 0x3a30, 0x3af5, 0x366f, 0x3b29, 0x3a6a, 0x33d5, 0x370a, 0x39cd, 0x3444, 0x3bea, 0x3b2b, 0x312e, 0x3b8e, 0x32cf, 0x3b79, 0x3302, 0x3bba, 0x3962, 0x3413, 0x37a1, 0x39e0, 0x3805 +}; \ No newline at end of file diff --git a/hwpe/redmule/inc/z_2D.h b/hwpe/redmule/inc/z_2D.h new file mode 100644 index 0000000..aff808a --- /dev/null +++ b/hwpe/redmule/inc/z_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t z_oup_2D [24][32] = { +0x4845, 0x4897, 0x4608, 0x4838, 0x4855, 0x487b, 0x4869, 0x4880, 0x46d1, 0x48b0, 0x48db, 0x483f, 0x48c9, 0x485f, 0x4881, 0x483a, 0x484b, 0x472c, 0x4762, 0x492b, 0x4822, 0x48fd, 0x488e, 0x492e, 
0x483e, 0x484f, 0x49e8, 0x46d7, 0x484b, 0x489d, 0x490b, 0x47e9, +0x484f, 0x47d2, 0x44be, 0x4747, 0x47c7, 0x46c0, 0x4727, 0x48af, 0x46c5, 0x482d, 0x483d, 0x482e, 0x4897, 0x479f, 0x488b, 0x4749, 0x489a, 0x46a8, 0x46f2, 0x488b, 0x4891, 0x47e8, 0x4872, 0x483d, 0x4716, 0x46fd, 0x49b5, 0x46a0, 0x46e7, 0x47a4, 0x48a1, 0x4767, +0x4939, 0x4935, 0x4703, 0x48c1, 0x4863, 0x48bd, 0x4913, 0x48cf, 0x48b6, 0x48b8, 0x4946, 0x4920, 0x495e, 0x48e1, 0x4938, 0x48b2, 0x493a, 0x4882, 0x483b, 0x49d5, 0x4911, 0x4972, 0x496b, 0x49df, 0x48f2, 0x4888, 0x4a46, 0x4821, 0x48c1, 0x490c, 0x49b2, 0x48a3, +0x463a, 0x47b0, 0x44cb, 0x4762, 0x4765, 0x46b9, 0x466a, 0x4814, 0x4631, 0x4796, 0x4666, 0x474b, 0x4798, 0x4704, 0x4838, 0x4761, 0x47d3, 0x4590, 0x45ea, 0x48a2, 0x47f1, 0x4844, 0x484b, 0x4776, 0x47d6, 0x46d8, 0x48f3, 0x44d3, 0x46fa, 0x478d, 0x481e, 0x466e, +0x4827, 0x481e, 0x45a2, 0x4794, 0x4727, 0x4806, 0x475d, 0x48d5, 0x4708, 0x4828, 0x4862, 0x480d, 0x4895, 0x4832, 0x48bd, 0x47f1, 0x482a, 0x46a7, 0x47b1, 0x492d, 0x484d, 0x4884, 0x48dc, 0x485f, 0x476d, 0x480c, 0x48e9, 0x46d3, 0x4728, 0x4884, 0x48a0, 0x480e, +0x4862, 0x4813, 0x4675, 0x485a, 0x47e8, 0x4738, 0x4836, 0x4823, 0x46e7, 0x4821, 0x4822, 0x47b3, 0x4846, 0x4855, 0x4863, 0x4717, 0x4872, 0x47c1, 0x46d5, 0x488e, 0x47e2, 0x485f, 0x487c, 0x48b8, 0x481e, 0x4788, 0x48bd, 0x4677, 0x46c9, 0x47f8, 0x48fe, 0x47fc, +0x47a0, 0x47b2, 0x4588, 0x467e, 0x4662, 0x46c7, 0x46e8, 0x4812, 0x4536, 0x474e, 0x46c0, 0x468f, 0x481f, 0x4679, 0x46a1, 0x46e2, 0x4809, 0x4560, 0x4630, 0x47eb, 0x46b5, 0x4757, 0x4848, 0x477f, 0x46a6, 0x46d8, 0x4870, 0x459a, 0x4670, 0x4678, 0x47d2, 0x468c, +0x4762, 0x48c4, 0x46e3, 0x4791, 0x46b1, 0x486d, 0x47d0, 0x4867, 0x468d, 0x47f6, 0x48a5, 0x4756, 0x4857, 0x4854, 0x4866, 0x4838, 0x484d, 0x46ec, 0x47d2, 0x48f6, 0x484a, 0x4879, 0x4848, 0x483c, 0x471d, 0x4806, 0x48fa, 0x4730, 0x4768, 0x47b8, 0x4865, 0x46f9, +0x48a8, 0x4918, 0x46ca, 0x4867, 0x4800, 0x4862, 0x48d3, 0x4910, 0x474e, 0x4849, 0x48eb, 0x486b, 0x4966, 0x48c5, 0x48f4, 0x4830, 0x48f9, 
0x4778, 0x481e, 0x499e, 0x48cf, 0x48f1, 0x4982, 0x4923, 0x487c, 0x47cf, 0x49ea, 0x4649, 0x4773, 0x495e, 0x48b2, 0x483f, +0x48a7, 0x4975, 0x4616, 0x481e, 0x481f, 0x4866, 0x48b6, 0x4864, 0x47dc, 0x4873, 0x485c, 0x487f, 0x4938, 0x491f, 0x490d, 0x48b6, 0x48f8, 0x48a1, 0x4859, 0x492d, 0x489c, 0x4915, 0x4899, 0x4887, 0x486c, 0x4859, 0x49ca, 0x471e, 0x4867, 0x4918, 0x48d3, 0x4827, +0x488b, 0x4998, 0x4704, 0x481d, 0x48b8, 0x4880, 0x4876, 0x4944, 0x470c, 0x48f2, 0x48b9, 0x489b, 0x4956, 0x48e5, 0x48d6, 0x48a5, 0x48dc, 0x4856, 0x484e, 0x49ab, 0x48e0, 0x490e, 0x48dd, 0x4945, 0x488b, 0x48dd, 0x4a32, 0x47ea, 0x4835, 0x4911, 0x4965, 0x4819, +0x460e, 0x481e, 0x452c, 0x4673, 0x475c, 0x4717, 0x46f6, 0x46d0, 0x4696, 0x46bc, 0x4726, 0x481e, 0x4763, 0x46ea, 0x46fe, 0x4758, 0x478b, 0x4627, 0x4704, 0x483f, 0x46ad, 0x47b1, 0x4792, 0x4816, 0x46f2, 0x4684, 0x4827, 0x45a8, 0x472f, 0x47a4, 0x4797, 0x462b, +0x483f, 0x48ab, 0x468f, 0x4863, 0x485a, 0x4766, 0x481d, 0x48cb, 0x47dc, 0x4903, 0x48fc, 0x4830, 0x48cc, 0x483e, 0x48ab, 0x4864, 0x4966, 0x4763, 0x4794, 0x499d, 0x488e, 0x488b, 0x48dc, 0x4960, 0x4854, 0x484c, 0x499c, 0x474c, 0x4826, 0x48bc, 0x4949, 0x4883, +0x489d, 0x4905, 0x4718, 0x481e, 0x48e3, 0x48f4, 0x48c1, 0x4904, 0x47e8, 0x48b3, 0x4892, 0x48d4, 0x48ff, 0x4894, 0x48d5, 0x4886, 0x48fa, 0x4803, 0x47d2, 0x492e, 0x4870, 0x48b2, 0x48e5, 0x492b, 0x487b, 0x4785, 0x49e3, 0x471d, 0x4837, 0x48bf, 0x489b, 0x48c4, +0x475c, 0x4871, 0x464a, 0x4811, 0x47af, 0x471c, 0x4817, 0x4817, 0x463b, 0x484e, 0x477f, 0x464f, 0x4704, 0x487c, 0x47a3, 0x4725, 0x4853, 0x462a, 0x465a, 0x4860, 0x4736, 0x4880, 0x47e1, 0x482b, 0x4811, 0x46c0, 0x48dc, 0x475d, 0x4668, 0x4806, 0x4893, 0x46f4, +0x4858, 0x4959, 0x463d, 0x487b, 0x480f, 0x484e, 0x48c0, 0x48a6, 0x4847, 0x4894, 0x48a0, 0x484a, 0x491e, 0x48f4, 0x48fc, 0x48b5, 0x48ce, 0x47d2, 0x47db, 0x497f, 0x4955, 0x4939, 0x48a7, 0x48ce, 0x4890, 0x4884, 0x49d6, 0x4763, 0x486e, 0x4922, 0x48f4, 0x48c3, +0x47ec, 0x491c, 0x4698, 0x4783, 0x4715, 0x4754, 0x4745, 0x4752, 0x472f, 0x4832, 
0x4817, 0x4809, 0x47f8, 0x48c3, 0x47e6, 0x4800, 0x48b6, 0x4730, 0x480a, 0x48cb, 0x479e, 0x488e, 0x47c2, 0x488e, 0x472f, 0x47ee, 0x489d, 0x4744, 0x4755, 0x4851, 0x4846, 0x47d3, +0x4838, 0x48a0, 0x4634, 0x4762, 0x4786, 0x4806, 0x47e3, 0x482d, 0x4726, 0x486c, 0x47b7, 0x4803, 0x48ac, 0x4814, 0x48e0, 0x4839, 0x4827, 0x4750, 0x46f2, 0x48c5, 0x483f, 0x4886, 0x48ad, 0x4856, 0x47e8, 0x47a9, 0x4937, 0x4743, 0x46d0, 0x481f, 0x484c, 0x4804, +0x47fd, 0x481f, 0x456d, 0x4813, 0x474d, 0x4807, 0x4688, 0x480e, 0x46e8, 0x4810, 0x469f, 0x4799, 0x4853, 0x478f, 0x47f2, 0x4824, 0x47d0, 0x471f, 0x46da, 0x485f, 0x4813, 0x481c, 0x482e, 0x4863, 0x4786, 0x480b, 0x48c9, 0x46b8, 0x475a, 0x46e2, 0x4852, 0x46c5, +0x45af, 0x4802, 0x4466, 0x46c2, 0x465d, 0x4743, 0x46b7, 0x47ba, 0x4636, 0x46c3, 0x4677, 0x4784, 0x485a, 0x47c2, 0x46dc, 0x46ac, 0x47de, 0x460e, 0x465f, 0x4834, 0x47f4, 0x4769, 0x46fc, 0x4810, 0x45fd, 0x45ea, 0x48d0, 0x45b5, 0x4704, 0x4783, 0x4830, 0x46c4, +0x4759, 0x47c7, 0x453d, 0x45b0, 0x4741, 0x4702, 0x4736, 0x4793, 0x461b, 0x47ba, 0x470b, 0x46dd, 0x4657, 0x470b, 0x470d, 0x4710, 0x486c, 0x468f, 0x45c3, 0x46ba, 0x479d, 0x483b, 0x46c9, 0x4774, 0x46a9, 0x46a7, 0x4833, 0x4606, 0x4690, 0x46a9, 0x46f5, 0x46a7, +0x47ac, 0x48bb, 0x452c, 0x4803, 0x470f, 0x4824, 0x47d5, 0x48cb, 0x4707, 0x484a, 0x4832, 0x4797, 0x4851, 0x482c, 0x487a, 0x4877, 0x4891, 0x465d, 0x47f4, 0x48ce, 0x4898, 0x4899, 0x484e, 0x486a, 0x47ac, 0x47f0, 0x493e, 0x4611, 0x47e2, 0x489e, 0x488c, 0x46af, +0x4665, 0x4836, 0x45e4, 0x46b6, 0x46a1, 0x46b9, 0x46c8, 0x46dd, 0x4658, 0x474b, 0x467b, 0x4777, 0x4769, 0x4798, 0x4785, 0x475e, 0x472a, 0x4656, 0x45fb, 0x4881, 0x46fc, 0x472d, 0x476e, 0x47a3, 0x465d, 0x46ca, 0x4855, 0x4500, 0x464f, 0x479a, 0x46c3, 0x4738, +0x481e, 0x486c, 0x4659, 0x4801, 0x4756, 0x477a, 0x47d5, 0x487b, 0x4706, 0x4808, 0x484f, 0x4838, 0x4870, 0x4863, 0x48d3, 0x4806, 0x4865, 0x4771, 0x46be, 0x494c, 0x4915, 0x484c, 0x4900, 0x4862, 0x481a, 0x46e8, 0x4974, 0x46a0, 0x4775, 0x483d, 0x487c, 0x480e +}; \ No newline at end 
of file diff --git a/hwpe/redmule/inc/z_output.h b/hwpe/redmule/inc/z_output.h new file mode 100644 index 0000000..96c7e5f --- /dev/null +++ b/hwpe/redmule/inc/z_output.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t z_oup [768] = { +0x4845, 0x4897, 0x4608, 0x4838, 0x4855, 0x487b, 0x4869, 0x4880, 0x46d1, 0x48b0, 0x48db, 0x483f, 0x48c9, 0x485f, 0x4881, 0x483a, 0x484b, 0x472c, 0x4762, 0x492b, 0x4822, 0x48fd, 0x488e, 0x492e, 0x483e, 0x484f, 0x49e8, 0x46d7, 0x484b, 0x489d, 0x490b, 0x47e9, +0x484f, 0x47d2, 0x44be, 0x4747, 0x47c7, 0x46c0, 0x4727, 0x48af, 0x46c5, 0x482d, 0x483d, 0x482e, 0x4897, 0x479f, 0x488b, 0x4749, 0x489a, 0x46a8, 0x46f2, 0x488b, 0x4891, 0x47e8, 0x4872, 0x483d, 0x4716, 0x46fd, 0x49b5, 0x46a0, 0x46e7, 0x47a4, 0x48a1, 0x4767, +0x4939, 0x4935, 0x4703, 0x48c1, 0x4863, 0x48bd, 0x4913, 0x48cf, 0x48b6, 0x48b8, 0x4946, 0x4920, 0x495e, 0x48e1, 0x4938, 0x48b2, 0x493a, 0x4882, 0x483b, 0x49d5, 0x4911, 0x4972, 0x496b, 0x49df, 0x48f2, 0x4888, 0x4a46, 0x4821, 0x48c1, 0x490c, 0x49b2, 0x48a3, +0x463a, 0x47b0, 0x44cb, 0x4762, 0x4765, 0x46b9, 0x466a, 0x4814, 0x4631, 0x4796, 0x4666, 0x474b, 0x4798, 0x4704, 0x4838, 0x4761, 0x47d3, 0x4590, 0x45ea, 0x48a2, 0x47f1, 0x4844, 0x484b, 0x4776, 0x47d6, 0x46d8, 0x48f3, 0x44d3, 0x46fa, 0x478d, 0x481e, 0x466e, +0x4827, 0x481e, 0x45a2, 0x4794, 0x4727, 0x4806, 0x475d, 0x48d5, 0x4708, 0x4828, 0x4862, 0x480d, 0x4895, 0x4832, 0x48bd, 0x47f1, 0x482a, 0x46a7, 0x47b1, 0x492d, 0x484d, 0x4884, 0x48dc, 0x485f, 0x476d, 0x480c, 0x48e9, 0x46d3, 0x4728, 0x4884, 0x48a0, 0x480e, +0x4862, 0x4813, 0x4675, 0x485a, 0x47e8, 0x4738, 0x4836, 0x4823, 0x46e7, 0x4821, 0x4822, 0x47b3, 0x4846, 0x4855, 0x4863, 0x4717, 0x4872, 0x47c1, 0x46d5, 0x488e, 0x47e2, 0x485f, 0x487c, 0x48b8, 0x481e, 0x4788, 0x48bd, 0x4677, 0x46c9, 0x47f8, 0x48fe, 0x47fc, +0x47a0, 0x47b2, 0x4588, 0x467e, 0x4662, 0x46c7, 0x46e8, 0x4812, 0x4536, 0x474e, 0x46c0, 0x468f, 0x481f, 0x4679, 0x46a1, 0x46e2, 0x4809, 0x4560, 0x4630, 0x47eb, 0x46b5, 0x4757, 0x4848, 
0x477f, 0x46a6, 0x46d8, 0x4870, 0x459a, 0x4670, 0x4678, 0x47d2, 0x468c, +0x4762, 0x48c4, 0x46e3, 0x4791, 0x46b1, 0x486d, 0x47d0, 0x4867, 0x468d, 0x47f6, 0x48a5, 0x4756, 0x4857, 0x4854, 0x4866, 0x4838, 0x484d, 0x46ec, 0x47d2, 0x48f6, 0x484a, 0x4879, 0x4848, 0x483c, 0x471d, 0x4806, 0x48fa, 0x4730, 0x4768, 0x47b8, 0x4865, 0x46f9, +0x48a8, 0x4918, 0x46ca, 0x4867, 0x4800, 0x4862, 0x48d3, 0x4910, 0x474e, 0x4849, 0x48eb, 0x486b, 0x4966, 0x48c5, 0x48f4, 0x4830, 0x48f9, 0x4778, 0x481e, 0x499e, 0x48cf, 0x48f1, 0x4982, 0x4923, 0x487c, 0x47cf, 0x49ea, 0x4649, 0x4773, 0x495e, 0x48b2, 0x483f, +0x48a7, 0x4975, 0x4616, 0x481e, 0x481f, 0x4866, 0x48b6, 0x4864, 0x47dc, 0x4873, 0x485c, 0x487f, 0x4938, 0x491f, 0x490d, 0x48b6, 0x48f8, 0x48a1, 0x4859, 0x492d, 0x489c, 0x4915, 0x4899, 0x4887, 0x486c, 0x4859, 0x49ca, 0x471e, 0x4867, 0x4918, 0x48d3, 0x4827, +0x488b, 0x4998, 0x4704, 0x481d, 0x48b8, 0x4880, 0x4876, 0x4944, 0x470c, 0x48f2, 0x48b9, 0x489b, 0x4956, 0x48e5, 0x48d6, 0x48a5, 0x48dc, 0x4856, 0x484e, 0x49ab, 0x48e0, 0x490e, 0x48dd, 0x4945, 0x488b, 0x48dd, 0x4a32, 0x47ea, 0x4835, 0x4911, 0x4965, 0x4819, +0x460e, 0x481e, 0x452c, 0x4673, 0x475c, 0x4717, 0x46f6, 0x46d0, 0x4696, 0x46bc, 0x4726, 0x481e, 0x4763, 0x46ea, 0x46fe, 0x4758, 0x478b, 0x4627, 0x4704, 0x483f, 0x46ad, 0x47b1, 0x4792, 0x4816, 0x46f2, 0x4684, 0x4827, 0x45a8, 0x472f, 0x47a4, 0x4797, 0x462b, +0x483f, 0x48ab, 0x468f, 0x4863, 0x485a, 0x4766, 0x481d, 0x48cb, 0x47dc, 0x4903, 0x48fc, 0x4830, 0x48cc, 0x483e, 0x48ab, 0x4864, 0x4966, 0x4763, 0x4794, 0x499d, 0x488e, 0x488b, 0x48dc, 0x4960, 0x4854, 0x484c, 0x499c, 0x474c, 0x4826, 0x48bc, 0x4949, 0x4883, +0x489d, 0x4905, 0x4718, 0x481e, 0x48e3, 0x48f4, 0x48c1, 0x4904, 0x47e8, 0x48b3, 0x4892, 0x48d4, 0x48ff, 0x4894, 0x48d5, 0x4886, 0x48fa, 0x4803, 0x47d2, 0x492e, 0x4870, 0x48b2, 0x48e5, 0x492b, 0x487b, 0x4785, 0x49e3, 0x471d, 0x4837, 0x48bf, 0x489b, 0x48c4, +0x475c, 0x4871, 0x464a, 0x4811, 0x47af, 0x471c, 0x4817, 0x4817, 0x463b, 0x484e, 0x477f, 0x464f, 0x4704, 0x487c, 0x47a3, 0x4725, 
0x4853, 0x462a, 0x465a, 0x4860, 0x4736, 0x4880, 0x47e1, 0x482b, 0x4811, 0x46c0, 0x48dc, 0x475d, 0x4668, 0x4806, 0x4893, 0x46f4, +0x4858, 0x4959, 0x463d, 0x487b, 0x480f, 0x484e, 0x48c0, 0x48a6, 0x4847, 0x4894, 0x48a0, 0x484a, 0x491e, 0x48f4, 0x48fc, 0x48b5, 0x48ce, 0x47d2, 0x47db, 0x497f, 0x4955, 0x4939, 0x48a7, 0x48ce, 0x4890, 0x4884, 0x49d6, 0x4763, 0x486e, 0x4922, 0x48f4, 0x48c3, +0x47ec, 0x491c, 0x4698, 0x4783, 0x4715, 0x4754, 0x4745, 0x4752, 0x472f, 0x4832, 0x4817, 0x4809, 0x47f8, 0x48c3, 0x47e6, 0x4800, 0x48b6, 0x4730, 0x480a, 0x48cb, 0x479e, 0x488e, 0x47c2, 0x488e, 0x472f, 0x47ee, 0x489d, 0x4744, 0x4755, 0x4851, 0x4846, 0x47d3, +0x4838, 0x48a0, 0x4634, 0x4762, 0x4786, 0x4806, 0x47e3, 0x482d, 0x4726, 0x486c, 0x47b7, 0x4803, 0x48ac, 0x4814, 0x48e0, 0x4839, 0x4827, 0x4750, 0x46f2, 0x48c5, 0x483f, 0x4886, 0x48ad, 0x4856, 0x47e8, 0x47a9, 0x4937, 0x4743, 0x46d0, 0x481f, 0x484c, 0x4804, +0x47fd, 0x481f, 0x456d, 0x4813, 0x474d, 0x4807, 0x4688, 0x480e, 0x46e8, 0x4810, 0x469f, 0x4799, 0x4853, 0x478f, 0x47f2, 0x4824, 0x47d0, 0x471f, 0x46da, 0x485f, 0x4813, 0x481c, 0x482e, 0x4863, 0x4786, 0x480b, 0x48c9, 0x46b8, 0x475a, 0x46e2, 0x4852, 0x46c5, +0x45af, 0x4802, 0x4466, 0x46c2, 0x465d, 0x4743, 0x46b7, 0x47ba, 0x4636, 0x46c3, 0x4677, 0x4784, 0x485a, 0x47c2, 0x46dc, 0x46ac, 0x47de, 0x460e, 0x465f, 0x4834, 0x47f4, 0x4769, 0x46fc, 0x4810, 0x45fd, 0x45ea, 0x48d0, 0x45b5, 0x4704, 0x4783, 0x4830, 0x46c4, +0x4759, 0x47c7, 0x453d, 0x45b0, 0x4741, 0x4702, 0x4736, 0x4793, 0x461b, 0x47ba, 0x470b, 0x46dd, 0x4657, 0x470b, 0x470d, 0x4710, 0x486c, 0x468f, 0x45c3, 0x46ba, 0x479d, 0x483b, 0x46c9, 0x4774, 0x46a9, 0x46a7, 0x4833, 0x4606, 0x4690, 0x46a9, 0x46f5, 0x46a7, +0x47ac, 0x48bb, 0x452c, 0x4803, 0x470f, 0x4824, 0x47d5, 0x48cb, 0x4707, 0x484a, 0x4832, 0x4797, 0x4851, 0x482c, 0x487a, 0x4877, 0x4891, 0x465d, 0x47f4, 0x48ce, 0x4898, 0x4899, 0x484e, 0x486a, 0x47ac, 0x47f0, 0x493e, 0x4611, 0x47e2, 0x489e, 0x488c, 0x46af, +0x4665, 0x4836, 0x45e4, 0x46b6, 0x46a1, 0x46b9, 0x46c8, 0x46dd, 0x4658, 
0x474b, 0x467b, 0x4777, 0x4769, 0x4798, 0x4785, 0x475e, 0x472a, 0x4656, 0x45fb, 0x4881, 0x46fc, 0x472d, 0x476e, 0x47a3, 0x465d, 0x46ca, 0x4855, 0x4500, 0x464f, 0x479a, 0x46c3, 0x4738,
+0x481e, 0x486c, 0x4659, 0x4801, 0x4756, 0x477a, 0x47d5, 0x487b, 0x4706, 0x4808, 0x484f, 0x4838, 0x4870, 0x4863, 0x48d3, 0x4806, 0x4865, 0x4771, 0x46be, 0x494c, 0x4915, 0x484c, 0x4900, 0x4862, 0x481a, 0x46e8, 0x4974, 0x46a0, 0x4775, 0x483d, 0x487c, 0x480e
+};
\ No newline at end of file
diff --git a/hwpe/redmule/pulp_inject_fault.tcl b/hwpe/redmule/pulp_inject_fault.tcl
new file mode 100644
index 0000000..61ccadf
--- /dev/null
+++ b/hwpe/redmule/pulp_inject_fault.tcl
@@ -0,0 +1,53 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+#
+# Author: Michael Rogenmoser (michaero@iis.ee.ethz.ch)
+
+transcript quietly
+if {! [info exists ::env(VSIM_PATH)]} {error "Define VSIM_PATH"}
+set utils_base_path [file join $::env(VSIM_PATH) scripts fault_injection_utils]
+set script_base_path [file join $::env(VSIM_PATH) fault_injection_sim scripts]
+
+set verbosity 2
+set log_injections 1
+# Easy way to generate a variable seed
+# set seed [clock seconds]
+# Default value (fixed seed, so runs are repeatable)
+set seed 12345
+set print_statistics 1
+
+set inject_start_time 550000000000ps
+set inject_stop_time 750000000000ps
+set injection_clock "pulp_cluster_tb/cluster_i/clk_i"
+set injection_clock_trigger 0
+set fault_period 150
+set rand_initial_injection_phase 0
+# max_num set to 0 means: keep injecting until inject_stop_time
+set max_num_fault_inject 0
+set signal_fault_duration 20ns
+set register_fault_duration 0ns
+
+set allow_multi_bit_upset [expr {[info exists ::env(MULTI_BIT_UPSET)] ? $::env(MULTI_BIT_UPSET) : 0}] ;# falls back to 0 when the variable is not exported, instead of aborting
+set use_bitwidth_as_weight 0
+set check_core_output_modification 0
+set check_core_next_state_modification 0
+set reg_to_sig_ratio 1
+
+source [file join $utils_base_path pulp_extract_nets.tcl]
+
+set inject_signals_netlist []
+set inject_register_netlist []
+set output_netlist []
+set next_state_netlist []
+set assertion_disable_list []
+
+# for {set idx 0} {$idx < 12} {incr idx} {
+#   set inject_signals_netlist [list {*}$inject_signals_netlist {*}[get_all_core_nets $idx]]
+#   set output_netlist [list {*}$output_netlist {*}[get_core_output_nets $idx]]
+# }
+
+set inject_register_netlist [list {*}$inject_register_netlist {*}[get_memory_slice {0 16} {256 336}]] ;# the only populated netlist: the per-core nets above stay disabled
+
+source [file join $script_base_path inject_fault.tcl]
+
diff --git a/hwpe/redmule/redmule.c b/hwpe/redmule/redmule.c
new file mode 100644
index 0000000..61a4b2b
--- /dev/null
+++ b/hwpe/redmule/redmule.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2022-2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Yvan Tortorella
+ *
+ * RedMulE SW test: core 0 offloads one job to RedMulE, checks the result with redmule_compare16 and, unless NO_ECC is defined, prints the ECC counters
+ */
+
+#include <stdint.h> // NOTE(review): header name was missing in this copy; stdint.h assumed from the uintN_t uses - confirm
+#include "stdio.h"
+#include "archi_redmule.h"
+#include "hal_redmule.h"
+#include "pulp.h"
+
+int main() {
+
+  volatile int errors = 0;
+  unsigned int cluster_id = rt_cluster_id();
+  #ifndef NO_ECC
+  unsigned int intc_data_correctable_cnt = 0, redmule_data_correctable_cnt = 0; // every counter starts at 0: cores other than 0 skip the block below but still evaluate the return expression
+  unsigned int intc_meta_correctable_cnt = 0;
+  unsigned int intc_data_uncorrectable_cnt = 0, redmule_data_uncorrectable_cnt = 0;
+  unsigned int intc_meta_uncorrectable_cnt = 0;
+  #endif
+
+  if(get_core_id() == 0){
+
+    uint16_t m_size = M_SIZE;
+    uint16_t n_size = N_SIZE;
+    uint16_t k_size = K_SIZE;
+
+    uint8_t *x_ext = (uint8_t *) x_inp; // explicit casts: z_oup is a uint16_t table, the other operands presumably likewise
+    uint8_t *w_ext = (uint8_t *) w_inp;
+    uint8_t *y_ext = (uint8_t *) y_inp;
+    uint8_t *z_ext = (uint8_t *) z_oup;
+
+    uint8_t volatile *x = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*n_size));
+    uint8_t volatile *w = (uint8_t volatile *) pi_l1_malloc(0, (2*n_size*k_size));
+    uint8_t volatile *y = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*k_size));
+
+    #ifdef USE_DMA
+    volatile unsigned int dma_id = 0;
+    dma_id = mchan_alloc();
+    mchan_transfer((unsigned int) 2*(2*m_size*n_size), // NOTE(review): twice the 2*m_size*n_size given to pi_l1_malloc for x (same pattern for w and y) - confirm the unit of len
+                   (unsigned int) x_ext,
+                   (unsigned int) x );
+    mchan_barrier(dma_id);
+    mchan_free(dma_id);
+
+    dma_id = mchan_alloc();
+    mchan_transfer((unsigned int) 2*(2*n_size*k_size),
+                   (unsigned int) w_ext,
+                   (unsigned int) w );
+    mchan_barrier(dma_id);
+    mchan_free(dma_id);
+
+    dma_id = mchan_alloc();
+    mchan_transfer((unsigned int) 2*(2*m_size*k_size),
+                   (unsigned int) y_ext,
+                   (unsigned int) y );
+    mchan_barrier(dma_id);
+    #else
+    generate_test_data16((int) x, (int) w, (int) y, (int) m_size, (int) n_size, (int) k_size);
+    #endif
+
+    int gold_sum = 0, check_sum = 0;
+    int i,j;
+
+    int offload_id_tmp, offload_id;
+
+    // Enable RedMulE (the empty asm statements are compiler memory barriers)
+    hwpe_cg_enable();
+    asm volatile("": : :"memory");
+
+    hwpe_soft_clear();
+    asm volatile("": : :"memory");
+
+    // redmule_cfg ((unsigned int) x,
+    //              (unsigned int) w,
+    //              (unsigned int) y,
+    //              m_size, n_size, k_size,
+    //              (uint8_t) GEMM,
+    //              (uint8_t) Float16);
+    redmule_x_add_set ((unsigned int) x);
+    redmule_w_add_set ((unsigned int) w);
+    redmule_y_add_set ((unsigned int) y);
+    redmule_z_add_set ((unsigned int) y); // Z points at the Y buffer: the result is checked in y below
+    redmule_cfg (m_size, n_size, k_size, gemm_ops);
+
+    // Start RedMulE operation
+    hwpe_trigger_job();
+
+    // Wait for end of computation
+    redmule_evt_wait();
+
+    #ifndef NO_ECC
+    // Check number of detected errors by ECC modules inside RedMulE
+    redmule_data_correctable_cnt = redmule_get_data_correctable_count();
+    redmule_data_uncorrectable_cnt = redmule_get_data_uncorrectable_count();
+    #endif
+
+    // Disable RedMulE
+    hwpe_cg_disable();
+
+    errors = redmule_compare16((int) y, (int) m_size, (int) k_size);
+
+    *(volatile int *) 0x1A1040A0 = errors; // volatile: this raw-address store must not be optimised away; NOTE(review): presumably a testbench status register - confirm
+
+    printf ("Terminated test with %d errors. See you!\n", errors);
+
+    #ifndef NO_ECC
+    // Check number of detected errors by ECC modules inside interconnect
+    intc_data_correctable_cnt = hwpe_hci_ecc_get_data_correctable_count(cluster_id);
+    intc_meta_correctable_cnt = hwpe_hci_ecc_get_meta_correctable_count(cluster_id);
+    intc_data_uncorrectable_cnt = hwpe_hci_ecc_get_data_uncorrectable_count(cluster_id);
+    intc_meta_uncorrectable_cnt = hwpe_hci_ecc_get_meta_uncorrectable_count(cluster_id);
+    for (int i = 0; i < 16; i++) {
+      intc_meta_correctable_cnt += tcdm_scrubber_get_mismatch_count(cluster_id, i);
+    }
+
+    printf ("Data errors corrected inside RedMulE: %u. Data errors uncorrectable inside RedMulE: %u \n",
+      redmule_data_correctable_cnt, redmule_data_uncorrectable_cnt);
+    printf("Data errors corrected inside intc: %u. Data errors uncorrectable inside intc: %u\n",
+      intc_data_correctable_cnt, intc_data_uncorrectable_cnt);
+    printf("Meta errors corrected inside intc: %u. Meta errors uncorrectable inside intc: %u\n",
+      intc_meta_correctable_cnt, intc_meta_uncorrectable_cnt);
+    #endif
+
+  }
+  synch_barrier();
+  #ifndef NO_ECC
+  return (errors != 0) && (redmule_data_uncorrectable_cnt==0 && intc_data_uncorrectable_cnt == 0 && intc_meta_uncorrectable_cnt == 0); // non-zero only for mismatches that no ECC module reported as uncorrectable; NOTE(review): presumably intended (silent corruption) - confirm
+  #else
+  return errors;
+  #endif
+}
diff --git a/hwpe/redmule_256iter/Makefile b/hwpe/redmule_256iter/Makefile
new file mode 100644
index 0000000..88346b6
--- /dev/null
+++ b/hwpe/redmule_256iter/Makefile
@@ -0,0 +1,20 @@
+PULP_APP = test
+PULP_APP_SRCS = redmule.c
+PULP_CFLAGS = -O3
+
+ifeq ($(use_dma),1)
+  PULP_CFLAGS += -DUSE_DMA
+endif
+
+ifeq ($(fault_inject),1)
+  export FAULT_INJECTION=1
+  export FAULT_INJECTION_SCRIPT=$(CURDIR)/pulp_inject_fault.tcl
+endif
+
+ifeq ($(multi_bit_upset),1)
+  export MULTI_BIT_UPSET=1
+else
+  export MULTI_BIT_UPSET=0
+endif
+
+include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk
diff --git a/hwpe/redmule_256iter/archi_redmule.h b/hwpe/redmule_256iter/archi_redmule.h
new file mode 100644
index 0000000..40eceee
--- /dev/null
+++ b/hwpe/redmule_256iter/archi_redmule.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2022-2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Yvan Tortorella
+ *
+ * High-level architecture of RedMulE
+ *
+ */
+
+#ifndef __ARCHI_REDMULE_H__
+#define __ARCHI_REDMULE_H__
+
+/*
+ * |========================================================================|
+ * ||                                                                      ||
+ * ||Control and generic configuration register layout                     ||
+ * |========================================================================|
+ * || # reg |  offset  |  bits   |   bitmask    ||  content                ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    0  |  0x0000  |  31: 0  |  0xFFFFFFFF  ||  TRIGGER                ||
+ * ||    1  |  0x0004  |  31: 0  |  0xFFFFFFFF  ||  ACQUIRE                ||
+ * ||    2  |  0x0008  |  31: 0  |  0xFFFFFFFF  ||  EVT_ENABLE             ||
+ * ||    3  |  0x000c  |  31: 0  |  0xFFFFFFFF  ||  STATUS                 ||
+ * ||    4  |  0x0010  |  31: 0  |  0xFFFFFFFF  ||  RUNNING_JOB            ||
+ * ||    5  |  0x0014  |  31: 0  |  0xFFFFFFFF  ||  SOFT_CLEAR             ||
+ * |========================================================================|
+ * ||                                                                      ||
+ * ||Job-dependent registers layout (legacy: matches the commented-out map)||
+ * |========================================================================|
+ * || # reg |  offset  |  bits   |   bitmask    ||  content                ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    0  |  0x0040  |  31: 0  |  0xFFFFFFFF  ||  X_ADDR                 ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    1  |  0x0044  |  31: 0  |  0xFFFFFFFF  ||  W_ADDR                 ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    2  |  0x0048  |  31: 0  |  0xFFFFFFFF  ||  Z_ADDR                 ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    3  |  0x004C  |         |              ||  Matrix Config 0 Reg    ||
+ * ||       |          |  31:16  |  0xFFFF0000  ||  K Size (W Columns)     ||
+ * ||       |          |  15: 0  |  0x0000FFFF  ||  M Size (X Rows)        ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    4  |  0x0050  |         |              ||  Matrix Config 1 Reg    ||
+ * ||       |          |  15: 0  |  0x0000FFFF  ||  N Size (X Cols/W Rows) ||
+ * ||-------+----------+---------+--------------++-------------------------||
+ * ||    5  |  0x0054  |         |              ||  Matrix Arithmetic Reg  ||
+ * ||       |          |  12:10  |  0x00001C00  ||  Operation selection    ||
+ * ||       |          |   9: 7  |  0x00000380  ||  Input/Output format    ||
+ * |========================================================================|
+ *
+ */
+
+/* PULP Cluster Archi defines */
+#define ARCHI_CLUST_CTRL_BASE ARCHI_CLUSTER_CTRL_ADDR
+#define ARCHI_CLUST_HWPE_BASE ARCHI_HWCE_ADDR
+#define DMA_COMMAND_QUEUE ARCHI_MCHAN_DEMUX_ADDR
+#define DMA_STATUS_REGISTER (ARCHI_MCHAN_DEMUX_ADDR + 4)
+#define ARCHI_CL_HWPE_EVT0 12
+#define ARCHI_CL_HWPE_EVT1 13
+#define FC_DMA_EVENT 8
+#define CL_DMA_EVENT 22
+#define CLUST_CTRL_HWPE_EN 0x18
+#define CLUST_CTRL_HWPE_EN_MASK 0x800
+#define __builtin_bitinsert(a,b,c,d) ((a) | ((((b) << (32-(c))) >> (32-(c))) << (d))) /* arguments parenthesised so expression arguments expand safely */
+
+// RedMulE architecture
+#define ADDR_WIDTH 32
+#define DATA_WIDTH 256
+#define REDMULE_FMT 16
+#define ARRAY_HEIGHT 4
+#define PIPE_REGS 3
+#define ARRAY_WIDTH 12 /* Upper limit is ARRAY_HEIGHT*PIPE_REGS */
+
+// Commands (byte offsets from ARCHI_CLUST_HWPE_BASE)
+#define REDMULE_TRIGGER 0x00
+#define REDMULE_ACQUIRE 0x04
+#define REDMULE_FINISHED 0x08
+#define REDMULE_STATUS 0x0C
+#define REDMULE_RUNNING_JOB 0x10
+#define REDMULE_SOFT_CLEAR 0x14
+
+// Registers (byte offsets added to REDMULE_REG_OFFS; the commented-out block is the legacy map)
+#define REDMULE_REG_OFFS 0x40
+// #define REDMULE_REG_X_PTR 0x00
+// #define REDMULE_REG_W_PTR 0x04
+// #define REDMULE_REG_Z_PTR 0x08
+// #define REDMULE_MCFG0_PTR 0x0C
+// #define REDMULE_MCFG1_PTR 0x10
+// #define REDMULE_ARITH_PTR 0x14
+#define REDMULE_REG_X_PTR 0x00
+#define REDMULE_REG_W_PTR 0x04
+#define REDMULE_REG_Y_PTR 0x08
+#define REDMULE_REG_Z_PTR 0x0C
+#define REDMULE_REG_X_ITER_PTR 0x10
+#define REDMULE_REG_W_ITER_PTR 0x14
+#define REDMULE_REG_LEFTOVERS_PTR 0x18
+#define REDMULE_REG_LEFT_PARAMS_PTR 0x1C
+#define REDMULE_REG_X_D1_STRIDE_PTR 0x20
+#define REDMULE_REG_W_TOT_LEN_PTR 0x24
+#define REDMULE_REG_TOT_X_READ_PTR 0x28
+#define REDMULE_REG_W_D0_STRIDE_PTR 0x2C
+#define REDMULE_REG_YZ_TOT_LEN_PTR 0x30
+#define REDMULE_REG_YZ_D0_STRIDE_PTR 0x34
+#define REDMULE_REG_YZ_D2_STRIDE_PTR 0x38
+#define REDMULE_REG_X_ROWS_OFFS_PTR 0x3C
+#define REDMULE_REG_X_BUFFER_SLOTS_PTR 0x40
+#define REDMULE_REG_X_TOT_LEN_PTR 0x44
+#define REDMULE_REG_OP_SELECTION 0x48
+
+#define REDMULE_ECC_REG_OFFS 0x90
+#define DATA_CORR_ERR 0x00
+#define DATA_UNCORR_ERR 0x04
+#define METADATA_CORR_ERR 0x08
+#define METADATA_UNCORR_ERR 0x0c
+
+// OPs definition
+#define MATMUL 0x0
+#define GEMM 0x1
+#define ADDMAX 0x2
+#define ADDMIN 0x3
+#define MULMAX 0x4
+#define MULMIN 0x5
+#define MAXMIN 0x6
+#define MINMAX 0x7
+
+// GEMM formats
+#define Float8 0x0
+#define Float16 0x1
+#define Float8Alt 0x2
+#define Float16Alt 0x3
+
+#define RNE 0x0
+#define RTZ 0x1
+#define OP_FMADD 0x0
+#define OP_ADD 0x2
+#define OP_MUL 0x3
+#define OP_MINMAX 0x7
+
+// FP Formats encoding
+#define FP16 0x2
+#define FP8 0x3
+#define FP16ALT 0x4
+#define FP8ALT 0x5
+
+/* DMA Archi: the PLP_DMA_*_BIT values are bit positions (used as shift amounts), not masks */
+#define DMA_TX 0
+#define DMA_RX 1
+#define DMA_INC 1
+
+#define PLP_DMA_TYPE_BIT 0x00000011
+#define PLP_DMA_INCR_BIT 0x00000012
+#define PLP_DMA_2D_BIT 0x00000013
+#define PLP_DMA_ELE_BIT 0x00000014
+#define PLP_DMA_ILE_BIT 0x00000015
+#define PLP_DMA_BLE_BIT 0x00000016
+#define PLP_DMA_2D_TCDM_BIT 0x00000017
+
+#endif
diff --git a/hwpe/redmule_256iter/hal_redmule.h b/hwpe/redmule_256iter/hal_redmule.h
new file mode 100644
index 0000000..8fc5000
--- /dev/null
+++ b/hwpe/redmule_256iter/hal_redmule.h
@@ -0,0 +1,556 @@
+/*
+ * Copyright (C) 2022-2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Yvan Tortorella
+ *
+ * RedMulE Hardware Abstraction Layer (HAL)
+ */
+
+#ifndef __HAL_REDMULE_H__
+#define __HAL_REDMULE_H__
+
+#include <stdint.h> // NOTE(review): header name was missing in this copy; stdint.h assumed from the uintN_t uses - confirm
+#include "inc/x_input.h"
+#include "inc/w_input.h"
+#include "inc/y_input.h"
+#include "inc/z_output.h"
+#include "inc/golden.h"
+#include "inc/tensor_dim.h"
+
+/*
+ *
+ * For control, generic configuration register layout,
+ * and job-dependent register map, look at archi_redmule.h
+ *
+ */
+
+// HWPE registers are memory-mapped at ARCHI_CLUST_HWPE_BASE. HWPE_WRITE/HWPE_READ go through a volatile
+// pointer so that no register store is dropped and every poll (e.g. hwpe_wait_acquire) reloads the register;
+// the __builtin_pulp_OffsetedWrite/__builtin_pulp_OffsetedRead builtins are not used in this version
+
+#define HWPE_WRITE(value, offset) (*(volatile int *)(ARCHI_CLUST_HWPE_BASE + (offset)) = (value))
+#define HWPE_READ(offset) (*(volatile int *)(ARCHI_CLUST_HWPE_BASE + (offset)))
+
+static inline void redmule_x_add_set (unsigned int value) { // write the X operand address register
+  HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_X_PTR);
+}
+
+static inline void redmule_w_add_set (unsigned int value) { // write the W operand address register
+  HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_W_PTR);
+}
+
+static inline void redmule_y_add_set (unsigned int value) { // write the Y operand address register
+  HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Y_PTR);
+}
+
+static inline void redmule_z_add_set (unsigned int value) { // write the Z (result) address register
+  HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Z_PTR);
+}
+
+// static inline void redmule_mcfg_set (uint32_t mcfg0, uint32_t mcfg1) {
+//   HWPE_WRITE(mcfg0, REDMULE_REG_OFFS + REDMULE_MCFG0_PTR);
+//   HWPE_WRITE(mcfg1, REDMULE_REG_OFFS + REDMULE_MCFG1_PTR);
+// }
+//
+// static inline void redmule_arith_set (uint32_t arith) {
+//   HWPE_WRITE(arith, REDMULE_REG_OFFS + REDMULE_ARITH_PTR);
+// }
+
+static inline void hwpe_trigger_job() { // write 0 to the TRIGGER register
+  HWPE_WRITE(0, REDMULE_TRIGGER);
+}
+
+static inline int hwpe_acquire_job() { // read ACQUIRE; hwpe_wait_acquire treats a negative value as "retry"
+  return HWPE_READ(REDMULE_ACQUIRE);
+}
+
+static inline unsigned int hwpe_get_status() {
+  return HWPE_READ(REDMULE_STATUS);
+}
+
+static inline unsigned int hwpe_get_running_job() {
+  return HWPE_READ(REDMULE_RUNNING_JOB);
+}
+
+static inline void hwpe_soft_clear() { // write 0 to the SOFT_CLEAR register
+  HWPE_WRITE(0, REDMULE_SOFT_CLEAR);
+}
+
+static inline void hwpe_cg_enable() { // set the HWPE enable bit in the cluster control register
+  *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) |= CLUST_CTRL_HWPE_EN_MASK;
+}
+
+static inline void hwpe_cg_disable() { // clear the HWPE enable bit in the cluster control register
+  *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) &= ~CLUST_CTRL_HWPE_EN_MASK;
+}
+
+static inline void redmule_evt_wait() { // wait on HWPE event 0 until the STATUS register reads 0
+  do {
+    eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0);
+  } while((*(int volatile *)(ARCHI_CLUST_HWPE_BASE + REDMULE_STATUS)) != 0);
+}
+
+static inline int hwpe_wait_acquire() { // retry hwpe_acquire_job, waiting on HWPE event 0, until it returns a non-negative id
+  int job_id = hwpe_acquire_job();
+  while(job_id < 0) {
+    eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0);
+    job_id = hwpe_acquire_job();
+  }
+  return job_id;
+}
+
+static inline unsigned int redmule_get_data_correctable_count () {
+  return HWPE_READ(REDMULE_ECC_REG_OFFS + DATA_CORR_ERR);
+}
+
+static inline unsigned int redmule_get_data_uncorrectable_count () {
+  return HWPE_READ(REDMULE_ECC_REG_OFFS + DATA_UNCORR_ERR);
+}
+
+static inline unsigned int redmule_get_meta_correctable_count () {
+  return HWPE_READ(REDMULE_ECC_REG_OFFS + METADATA_CORR_ERR);
+}
+
+static inline unsigned int redmule_get_meta_uncorrectable_count () {
+  return HWPE_READ(REDMULE_ECC_REG_OFFS + METADATA_UNCORR_ERR);
+}
+
+/* DMA APIs */
+static inline int mchan_alloc(){ // a read of the command queue returns the transfer id
+  return *(volatile int*) DMA_COMMAND_QUEUE;
+}
+
+static inline void mchan_transfer(unsigned int len,
+                                  unsigned int ext_addr,
+                                  unsigned int tcdm_addr) {
+
+  *(volatile int*) DMA_COMMAND_QUEUE = len |
+                                       (DMA_RX << PLP_DMA_TYPE_BIT) |
+                                       (DMA_INC << PLP_DMA_INCR_BIT) |
+                                       (0 << PLP_DMA_2D_BIT) |
+                                       (1 << PLP_DMA_ELE_BIT) |
+                                       (1 << PLP_DMA_ILE_BIT) |
+                                       (0 << PLP_DMA_BLE_BIT) |
+                                       (0 << PLP_DMA_2D_TCDM_BIT);
+  *(volatile int*) DMA_COMMAND_QUEUE = tcdm_addr;
+  *(volatile int*) DMA_COMMAND_QUEUE = ext_addr;
+}
+
+static inline void mchan_barrier(int id) { // wait while bit `id` of the DMA status register is set
+  while(((*(volatile int*)(DMA_STATUS_REGISTER)) >> id ) & 0x1 ) {
+    eu_evt_maskWaitAndClr(1 << FC_DMA_EVENT);
+  }
+}
+
+static inline void mchan_free(int id) { // write bit `id` to the DMA status register
+  *(volatile int*) DMA_STATUS_REGISTER = 0x1 << id;
+}
+
+// void redmule_cfg (unsigned int x, unsigned int w, unsigned int z,
+//                   uint16_t m_size, uint16_t n_size, uint16_t k_size,
+//                   uint8_t gemm_op, uint8_t gemm_fmt){
+//
+//   uint32_t mcfg_reg0 = 0;
+//   uint32_t mcfg_reg1 = 0;
+//   uint32_t arith_reg = 0;
+//
+//   mcfg_reg0 = (k_size << 16) |
+//               (m_size << 0);
+//   mcfg_reg1 = n_size << 0;
+//
+//   arith_reg = (gemm_op << 10) |
+//               (gemm_fmt << 7);
+//
+//   redmule_x_add_set ((unsigned int) x);
+//   redmule_w_add_set ((unsigned int) w);
+//   redmule_z_add_set ((unsigned int) z);
+//   redmule_mcfg_set ((unsigned int) mcfg_reg0,
+//                     (unsigned int) mcfg_reg1);
+//   redmule_arith_set ((unsigned int) arith_reg);
+//
+// }
+
+void redmule_cfg (uint16_t m_size, uint16_t n_size, uint16_t k_size, uint8_t gemm_ops){
+  uint32_t x_iters = 0;
+  uint32_t w_iters = 0;
+  uint32_t leftovers = 0;
+  uint32_t left_params = 0;
+  uint32_t x_d1_stride = 0;
+  uint32_t x_rows_offs = 0;
+  uint32_t w_tot_len = 0;
+  uint32_t w_d1_len = 0;
+  uint32_t w_d0_stride = 0;
+  uint32_t yz_tot_len = 0;
+  uint32_t yz_d0_stride = 0;
+  uint32_t yz_d2_stride = 0;
+  uint32_t tot_x_read = 0;
+  uint32_t x_buffer_slots = 0;
+  uint32_t op_selection = 0;
+  uint16_t tot_stores = 0;
+  uint16_t w_rows = n_size;
+  uint16_t depth = DATA_WIDTH/(ARRAY_HEIGHT*FPFORMAT);
+  uint8_t tile = ARRAY_HEIGHT*(PIPE_REGS + 1);
+  _Bool x_rows_sub = 0;
+  _Bool x_cols_sub = 0;
+  _Bool w_cols_sub = 0;
+  uint16_t x_rows_iter,
+           x_rows_iter_tmp,
+           w_rows_iter,
+           w_rows_iter_tmp;
+  uint16_t x_cols_iter,
+           x_cols_iter_tmp,
+           w_cols_iter,
+           w_cols_iter_tmp;
+  uint8_t x_rows_lftovr,
+          x_cols_lftovr,
+          w_rows_lftovr,
+          w_cols_lftovr,
+          slots;
+
+  // Calculating
the number of iterations alng the two dimensions of the X matrix + x_rows_iter_tmp = m_size/ARRAY_WIDTH; + x_cols_iter_tmp = n_size/tile; + + // Calculating the number of iterations alng the two dimensions of the W matrix + w_rows_iter_tmp = w_rows; + w_cols_iter_tmp = k_size/tile; + + // Calculating the residuals along the input dimensions + x_rows_lftovr = m_size - (x_rows_iter_tmp*ARRAY_WIDTH); + x_cols_lftovr = n_size - (x_cols_iter_tmp*tile); + + // Calculating the residuals along the weight dimensions + w_rows_lftovr = n_size - (ARRAY_HEIGHT*(w_rows/ARRAY_HEIGHT)); + w_cols_lftovr = k_size - (w_cols_iter_tmp*tile); + + if (w_cols_lftovr != 0) + w_cols_iter = w_cols_iter_tmp + 1; + else + w_cols_iter = w_cols_iter_tmp; + + if (w_rows_lftovr != 0) + w_rows_iter = w_rows_iter_tmp + ARRAY_HEIGHT - w_rows_lftovr; + else + w_rows_iter = w_rows_iter_tmp; + + if (x_cols_lftovr != 0) + x_cols_iter = x_cols_iter_tmp + 1; + else + x_cols_iter = x_cols_iter_tmp; + + if (x_rows_lftovr != 0) + x_rows_iter = x_rows_iter_tmp + 1; + else + x_rows_iter = x_rows_iter_tmp; + + if (x_cols_lftovr%depth != 0) + x_buffer_slots = x_cols_lftovr/depth + 1; + else + x_buffer_slots = x_cols_lftovr/depth; + + // Calculating the number of total stores + tot_stores = x_rows_iter*w_cols_iter; + + // Determining if input matrixes are sub-matrixes + if (m_size < ARRAY_WIDTH) + x_rows_sub = 1; + if (n_size < ARRAY_HEIGHT) + x_cols_sub = 1; + if (k_size < tile) + w_cols_sub = 1; + + // Operation selection + switch (gemm_ops) { + case MATMUL: + op_selection |= (RNE << 29 | RNE << 26 | OP_FMADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 0; + break; + + case GEMM: + op_selection |= (RNE << 29 | RNE << 26 | OP_FMADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case ADDMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_ADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case ADDMIN: + op_selection |= (RNE << 29 | RNE << 26 | 
OP_ADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MULMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_MUL << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MULMIN: + op_selection |= (RNE << 29 | RNE << 26 | OP_MUL << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MAXMIN: + op_selection |= (RTZ << 29 | RNE << 26 | OP_MINMAX << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MINMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_MINMAX << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + } + + // Storing iterations and residuals in registers + x_iters |= x_rows_iter << 16 | x_cols_iter << 0; + w_iters |= w_rows_iter << 16 | w_cols_iter << 0; + leftovers |= x_rows_lftovr << 24 | x_cols_lftovr << 16 | w_rows_lftovr << 8 | w_cols_lftovr << 0; + left_params |= tot_stores << 16 | x_rows_sub << 15 | x_cols_sub << 14 | w_cols_sub << 13; + x_d1_stride = ((4*FPFORMAT)/ADDR_WIDTH)*(((DATA_WIDTH/FPFORMAT)*x_cols_iter_tmp) + x_cols_lftovr); + x_rows_offs = ARRAY_WIDTH*x_d1_stride; + w_tot_len = w_rows_iter*w_cols_iter*x_rows_iter; + w_d0_stride = ((4*FPFORMAT)/ADDR_WIDTH)*(((DATA_WIDTH/FPFORMAT)*w_cols_iter_tmp) + w_cols_lftovr); + yz_tot_len = ARRAY_WIDTH*x_rows_iter*w_cols_iter; + yz_d0_stride = w_d0_stride; + yz_d2_stride = ARRAY_WIDTH*w_d0_stride; + tot_x_read = x_rows_iter*x_cols_iter*w_cols_iter; + + // Writing the computations in configuration register + HWPE_WRITE(x_iters , REDMULE_REG_OFFS + REDMULE_REG_X_ITER_PTR ); + HWPE_WRITE(w_iters , REDMULE_REG_OFFS + REDMULE_REG_W_ITER_PTR ); + HWPE_WRITE(leftovers , REDMULE_REG_OFFS + REDMULE_REG_LEFTOVERS_PTR ); + HWPE_WRITE(left_params , REDMULE_REG_OFFS + REDMULE_REG_LEFT_PARAMS_PTR ); + HWPE_WRITE(x_d1_stride , REDMULE_REG_OFFS + REDMULE_REG_X_D1_STRIDE_PTR ); + HWPE_WRITE(x_rows_offs , REDMULE_REG_OFFS + REDMULE_REG_X_ROWS_OFFS_PTR ); + HWPE_WRITE(tot_x_read , REDMULE_REG_OFFS + 
REDMULE_REG_TOT_X_READ_PTR ); + HWPE_WRITE(x_buffer_slots, REDMULE_REG_OFFS + REDMULE_REG_X_BUFFER_SLOTS_PTR ); + HWPE_WRITE(w_tot_len , REDMULE_REG_OFFS + REDMULE_REG_W_TOT_LEN_PTR ); + HWPE_WRITE(w_d0_stride , REDMULE_REG_OFFS + REDMULE_REG_W_D0_STRIDE_PTR ); + HWPE_WRITE(yz_tot_len , REDMULE_REG_OFFS + REDMULE_REG_YZ_TOT_LEN_PTR ); + HWPE_WRITE(yz_d0_stride , REDMULE_REG_OFFS + REDMULE_REG_YZ_D0_STRIDE_PTR ); + HWPE_WRITE(yz_d2_stride , REDMULE_REG_OFFS + REDMULE_REG_YZ_D2_STRIDE_PTR ); + HWPE_WRITE(op_selection , REDMULE_REG_OFFS + REDMULE_REG_OP_SELECTION ); +} + +void generate_test_data16(int x_start_addr, + int w_start_addr, + int y_start_addr, + int m_size, + int n_size, + int k_size) { + + int x_addr = x_start_addr; + int w_addr = w_start_addr; + int y_addr = y_start_addr; + int x_end_addr = x_start_addr + (2*m_size*n_size); + int w_end_addr = w_start_addr + (2*n_size*k_size); + int y_end_addr = y_start_addr + (2*m_size*k_size); + + // Generating input stimuli from golden model + for (x_addr = x_start_addr; x_addr < x_end_addr; x_addr += 2) { + int x = x_addr - x_start_addr; + *(uint32_t *)(x_addr) = x_inp[x/2]; + } + + // Generating Weight stimuli from golden model + for (w_addr = w_start_addr; w_addr < w_end_addr; w_addr += 2) { + int w = w_addr - w_start_addr; + *(uint32_t *)(w_addr) = w_inp[w/2]; + } + + for (y_addr = y_start_addr; y_addr < y_end_addr; y_addr += 2) { + int y = y_addr - y_start_addr; + *(uint32_t *)(y_addr) = y_inp[y/2]; + } +} + +int redmule_compare16 (int z_start_addr, int m_size, int k_size) { + int err = 0; + int z_end_addr = z_start_addr + 2*m_size*k_size; + uint16_t z_computed; + uint16_t diff, diff_1, diff_2; + + for (int z_addr = z_start_addr; z_addr < z_end_addr; z_addr += 2) { + int z = z_addr - z_start_addr; + z_computed = *(uint32_t *)(z_addr); + + if ( z_computed != z_oup[z/2] ) { + diff_1 = z_computed - z_oup[z/2]; + if (diff_1 > 3) { + diff_2 = z_oup[z/2] - z_computed; + if (diff_2 > 3) { + err++; + } + } + } + } + + 
return err; + +} + +int redmule16_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { + #define ERR 0x0011 + uint32_t actual_word = 0; + uint16_t actual_MSHWord, actual_LSHWord; + uint32_t golden_word = 0; + uint16_t golden_MSHWord, golden_LSHWord; + uint32_t actual = 0; + uint32_t golden = 0; + + int errors = 0; + int error; + + for (int i=0; i golden_LSHWord) ? (actual_LSHWord - golden_LSHWord) : 0; + diff = (actual_LSHWord < golden_LSHWord) ? (golden_LSHWord - actual_LSHWord) : 0; + + if (diff > ERR) { + error = 1; + #ifdef VERBOSE + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("LSW: Error!\n"); + #endif + } + + // Checking Most Significant Half-Word + actual_MSHWord = (uint16_t)((actual_word >> 16) & 0x0000FFFF); + golden_MSHWord = (uint16_t)((golden_word >> 16) & 0x0000FFFF); + + diff = (actual_MSHWord > golden_MSHWord) ? (actual_MSHWord - golden_MSHWord) : 0; + diff = (actual_MSHWord < golden_MSHWord) ? (golden_MSHWord - actual_MSHWord) : 0; + + if (diff > ERR) { + error = 1; + #ifdef VERBOSE + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("MSW: Error!\n"); + #endif + } + + errors += error; + + #ifdef DEBUG + tfp_printf(" Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); + #endif + + #ifdef VERBOSE + if(error) { + if(errors==1) tfp_printf(" golden <- actual @ address @ index\n"); + tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z+i), i*4); + } + #endif + } + return errors; +} + +int redmule8_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { + #define ERR 0x0011 + uint32_t actual_word = 0; + uint8_t actual_Byte0, + actual_Byte1, + actual_Byte2, + actual_Byte3; + uint32_t golden_word = 0; + uint8_t golden_Byte0, + golden_Byte1, + golden_Byte2, + golden_Byte3; + uint32_t actual = 0; + uint32_t golden = 0; + + int errors = 0; + int error; + + for (int i=0; i golden_Byte0) ? (actual_Byte0 - golden_Byte0) : 0; + diff = (actual_Byte0 < golden_Byte0) ? 
(golden_Byte0 - actual_Byte0) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte0: Error!\n"); + } + + // Cheching Byte1 + actual_Byte1 = (uint8_t)( (actual_word >> 8 ) & 0x000000FF); + golden_Byte1 = (uint8_t)( (golden_word >> 8 ) & 0x000000FF); + + diff = (actual_Byte1 > golden_Byte1) ? (actual_Byte1 - golden_Byte1) : 0; + diff = (actual_Byte1 < golden_Byte1) ? (golden_Byte1 - actual_Byte1) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte1: Error!\n"); + } + + // Cheching Byte2 + actual_Byte2 = (uint8_t)( (actual_word >> 16 ) & 0x000000FF); + golden_Byte2 = (uint8_t)( (golden_word >> 16 ) & 0x000000FF); + + diff = (actual_Byte2 > golden_Byte2) ? (actual_Byte2 - golden_Byte2) : 0; + diff = (actual_Byte2 < golden_Byte2) ? (golden_Byte2 - actual_Byte2) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte2: Error!\n"); + } + + // Cheching Byte3 + actual_Byte3 = (uint8_t)( (actual_word >> 24 ) & 0x000000FF); + golden_Byte3 = (uint8_t)( (golden_word >> 24 ) & 0x000000FF); + + diff = (actual_Byte3 > golden_Byte3) ? (actual_Byte3 - golden_Byte3) : 0; + diff = (actual_Byte3 < golden_Byte3) ? 
(golden_Byte3 - actual_Byte3) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte3: Error!\n"); + } + + errors += error; + + #ifdef DEBUG + tfp_printf(" Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); + #endif + + #ifdef VERBOSE + if(error) { + if(errors==1) tfp_printf(" golden <- actual @ address @ index\n"); + tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z+i), i*4); + } + #endif + } + return errors; +} + +#endif diff --git a/hwpe/redmule_256iter/inc/golden.h b/hwpe/redmule_256iter/inc/golden.h new file mode 100644 index 0000000..f664e47 --- /dev/null +++ b/hwpe/redmule_256iter/inc/golden.h @@ -0,0 +1,387 @@ + /* Header file generated by RedMulE Golden Model */ +uint32_t golden [384] = { +0x48974845, +0x48384608, +0x487b4855, +0x48804869, +0x48b046d1, +0x483f48db, +0x485f48c9, +0x483a4881, +0x472c484b, +0x492b4762, +0x48fd4822, +0x492e488e, +0x484f483e, +0x46d749e8, +0x489d484b, +0x47e9490b, +0x47d2484f, +0x474744be, +0x46c047c7, +0x48af4727, +0x482d46c5, +0x482e483d, +0x479f4897, +0x4749488b, +0x46a8489a, +0x488b46f2, +0x47e84891, +0x483d4872, +0x46fd4716, +0x46a049b5, +0x47a446e7, +0x476748a1, +0x49354939, +0x48c14703, +0x48bd4863, +0x48cf4913, +0x48b848b6, +0x49204946, +0x48e1495e, +0x48b24938, +0x4882493a, +0x49d5483b, +0x49724911, +0x49df496b, +0x488848f2, +0x48214a46, +0x490c48c1, +0x48a349b2, +0x47b0463a, +0x476244cb, +0x46b94765, +0x4814466a, +0x47964631, +0x474b4666, +0x47044798, +0x47614838, +0x459047d3, +0x48a245ea, +0x484447f1, +0x4776484b, +0x46d847d6, +0x44d348f3, +0x478d46fa, +0x466e481e, +0x481e4827, +0x479445a2, +0x48064727, +0x48d5475d, +0x48284708, +0x480d4862, +0x48324895, +0x47f148bd, +0x46a7482a, +0x492d47b1, +0x4884484d, +0x485f48dc, +0x480c476d, +0x46d348e9, +0x48844728, +0x480e48a0, +0x48134862, +0x485a4675, +0x473847e8, +0x48234836, +0x482146e7, +0x47b34822, +0x48554846, +0x47174863, +0x47c14872, +0x488e46d5, +0x485f47e2, +0x48b8487c, 
+0x4788481e, +0x467748bd, +0x47f846c9, +0x47fc48fe, +0x47b247a0, +0x467e4588, +0x46c74662, +0x481246e8, +0x474e4536, +0x468f46c0, +0x4679481f, +0x46e246a1, +0x45604809, +0x47eb4630, +0x475746b5, +0x477f4848, +0x46d846a6, +0x459a4870, +0x46784670, +0x468c47d2, +0x48c44762, +0x479146e3, +0x486d46b1, +0x486747d0, +0x47f6468d, +0x475648a5, +0x48544857, +0x48384866, +0x46ec484d, +0x48f647d2, +0x4879484a, +0x483c4848, +0x4806471d, +0x473048fa, +0x47b84768, +0x46f94865, +0x491848a8, +0x486746ca, +0x48624800, +0x491048d3, +0x4849474e, +0x486b48eb, +0x48c54966, +0x483048f4, +0x477848f9, +0x499e481e, +0x48f148cf, +0x49234982, +0x47cf487c, +0x464949ea, +0x495e4773, +0x483f48b2, +0x497548a7, +0x481e4616, +0x4866481f, +0x486448b6, +0x487347dc, +0x487f485c, +0x491f4938, +0x48b6490d, +0x48a148f8, +0x492d4859, +0x4915489c, +0x48874899, +0x4859486c, +0x471e49ca, +0x49184867, +0x482748d3, +0x4998488b, +0x481d4704, +0x488048b8, +0x49444876, +0x48f2470c, +0x489b48b9, +0x48e54956, +0x48a548d6, +0x485648dc, +0x49ab484e, +0x490e48e0, +0x494548dd, +0x48dd488b, +0x47ea4a32, +0x49114835, +0x48194965, +0x481e460e, +0x4673452c, +0x4717475c, +0x46d046f6, +0x46bc4696, +0x481e4726, +0x46ea4763, +0x475846fe, +0x4627478b, +0x483f4704, +0x47b146ad, +0x48164792, +0x468446f2, +0x45a84827, +0x47a4472f, +0x462b4797, +0x48ab483f, +0x4863468f, +0x4766485a, +0x48cb481d, +0x490347dc, +0x483048fc, +0x483e48cc, +0x486448ab, +0x47634966, +0x499d4794, +0x488b488e, +0x496048dc, +0x484c4854, +0x474c499c, +0x48bc4826, +0x48834949, +0x4905489d, +0x481e4718, +0x48f448e3, +0x490448c1, +0x48b347e8, +0x48d44892, +0x489448ff, +0x488648d5, +0x480348fa, +0x492e47d2, +0x48b24870, +0x492b48e5, +0x4785487b, +0x471d49e3, +0x48bf4837, +0x48c4489b, +0x4871475c, +0x4811464a, +0x471c47af, +0x48174817, +0x484e463b, +0x464f477f, +0x487c4704, +0x472547a3, +0x462a4853, +0x4860465a, +0x48804736, +0x482b47e1, +0x46c04811, +0x475d48dc, +0x48064668, +0x46f44893, +0x49594858, +0x487b463d, +0x484e480f, +0x48a648c0, +0x48944847, 
+0x484a48a0, +0x48f4491e, +0x48b548fc, +0x47d248ce, +0x497f47db, +0x49394955, +0x48ce48a7, +0x48844890, +0x476349d6, +0x4922486e, +0x48c348f4, +0x491c47ec, +0x47834698, +0x47544715, +0x47524745, +0x4832472f, +0x48094817, +0x48c347f8, +0x480047e6, +0x473048b6, +0x48cb480a, +0x488e479e, +0x488e47c2, +0x47ee472f, +0x4744489d, +0x48514755, +0x47d34846, +0x48a04838, +0x47624634, +0x48064786, +0x482d47e3, +0x486c4726, +0x480347b7, +0x481448ac, +0x483948e0, +0x47504827, +0x48c546f2, +0x4886483f, +0x485648ad, +0x47a947e8, +0x47434937, +0x481f46d0, +0x4804484c, +0x481f47fd, +0x4813456d, +0x4807474d, +0x480e4688, +0x481046e8, +0x4799469f, +0x478f4853, +0x482447f2, +0x471f47d0, +0x485f46da, +0x481c4813, +0x4863482e, +0x480b4786, +0x46b848c9, +0x46e2475a, +0x46c54852, +0x480245af, +0x46c24466, +0x4743465d, +0x47ba46b7, +0x46c34636, +0x47844677, +0x47c2485a, +0x46ac46dc, +0x460e47de, +0x4834465f, +0x476947f4, +0x481046fc, +0x45ea45fd, +0x45b548d0, +0x47834704, +0x46c44830, +0x47c74759, +0x45b0453d, +0x47024741, +0x47934736, +0x47ba461b, +0x46dd470b, +0x470b4657, +0x4710470d, +0x468f486c, +0x46ba45c3, +0x483b479d, +0x477446c9, +0x46a746a9, +0x46064833, +0x46a94690, +0x46a746f5, +0x48bb47ac, +0x4803452c, +0x4824470f, +0x48cb47d5, +0x484a4707, +0x47974832, +0x482c4851, +0x4877487a, +0x465d4891, +0x48ce47f4, +0x48994898, +0x486a484e, +0x47f047ac, +0x4611493e, +0x489e47e2, +0x46af488c, +0x48364665, +0x46b645e4, +0x46b946a1, +0x46dd46c8, +0x474b4658, +0x4777467b, +0x47984769, +0x475e4785, +0x4656472a, +0x488145fb, +0x472d46fc, +0x47a3476e, +0x46ca465d, +0x45004855, +0x479a464f, +0x473846c3, +0x486c481e, +0x48014659, +0x477a4756, +0x487b47d5, +0x48084706, +0x4838484f, +0x48634870, +0x480648d3, +0x47714865, +0x494c46be, +0x484c4915, +0x48624900, +0x46e8481a, +0x46a04974, +0x483d4775, +0x480e487c, +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/tensor_dim.h b/hwpe/redmule_256iter/inc/tensor_dim.h new file mode 100644 index 0000000..21bd0d8 --- /dev/null +++ 
b/hwpe/redmule_256iter/inc/tensor_dim.h @@ -0,0 +1,13 @@ + /* Header file generated by RedMulE Golden Model */ +#ifndef __TENSOR_DIM__ +#define __TENSOR_DIM__ + +#define M_SIZE 24 +#define N_SIZE 32 +#define K_SIZE 32 +#define SRC_FMT FP16 +#define DST_FMT FP16 +#define FPFORMAT 16 +uint8_t gemm_ops = GEMM; + +#endif diff --git a/hwpe/redmule_256iter/inc/w_2D.h b/hwpe/redmule_256iter/inc/w_2D.h new file mode 100644 index 0000000..9409c64 --- /dev/null +++ b/hwpe/redmule_256iter/inc/w_2D.h @@ -0,0 +1,35 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t w_inp_2D [32][32] = { +0x311a, 0x39e0, 0x387d, 0x3a4a, 0x386f, 0x3ada, 0x392f, 0x3854, 0x3014, 0x2fd2, 0x31c9, 0x2fca, 0x2e55, 0x3bc8, 0x396d, 0x3b1d, 0x39f6, 0x333a, 0x3908, 0x3628, 0x3bab, 0x3b8b, 0x3b4a, 0x322d, 0x3925, 0x317a, 0x3725, 0x31c2, 0x3066, 0x38f3, 0x3a17, 0x3476, +0x3bda, 0x3196, 0x3922, 0x3680, 0x396a, 0x3021, 0x3761, 0x374d, 0x2fc2, 0x3967, 0x3b94, 0x33b5, 0x3797, 0x34d6, 0x3655, 0x2176, 0x39bc, 0x3999, 0x3658, 0x3904, 0x3759, 0x2ade, 0x3a5a, 0x3b78, 0x36c7, 0x2d01, 0x3b58, 0x2d9a, 0x373d, 0x3952, 0x38e8, 0x3887, +0x37b6, 0x3a88, 0x2f8a, 0x2d79, 0x3413, 0x3421, 0x3976, 0x32b2, 0x3446, 0x2d99, 0x3a56, 0x3322, 0x3b49, 0x39fa, 0x3acd, 0x3af6, 0x304c, 0x3abb, 0x3a83, 0x38b2, 0x3ab9, 0x363e, 0x389f, 0x31bb, 0x38e1, 0x3bc4, 0x3b9b, 0x2984, 0x3a43, 0x3b2f, 0x35d6, 0x3bda, +0x2df3, 0x3bf8, 0x2acc, 0x378b, 0x3555, 0x2e59, 0x31d4, 0x34ec, 0x3a46, 0x3bab, 0x3214, 0x3161, 0x3470, 0x3a03, 0x368e, 0x31ad, 0x27cb, 0x2ecb, 0x3422, 0x39f7, 0x3644, 0x3a77, 0x313f, 0x34f2, 0x39b3, 0x3bf2, 0x379a, 0x3456, 0x35fe, 0x3ae7, 0x3964, 0x385f, +0x3b16, 0x3999, 0x3833, 0x2eda, 0x3afd, 0x3a4a, 0x3ba2, 0x2bd4, 0x3b38, 0x31a2, 0x32dd, 0x353c, 0x366f, 0x375e, 0x3821, 0x367a, 0x3b44, 0x39e6, 0x3787, 0x339e, 0x39d7, 0x38c6, 0x37d5, 0x342f, 0x3984, 0x319b, 0x33b5, 0x35ab, 0x398a, 0x374e, 0x36b6, 0x3b21, +0x3bbb, 0x2ab3, 0x2ad5, 0x33bc, 0x2bef, 0x3780, 0x3738, 0x3a0b, 0x3b09, 0x30ca, 0x384e, 0x3ab3, 0x39bd, 0x3453, 
0x3a6d, 0x3957, 0x2c10, 0x30e9, 0x35d4, 0x3aef, 0x3be9, 0x39ad, 0x3a74, 0x3af9, 0x3739, 0x2d4d, 0x39fe, 0x3b72, 0x2c57, 0x398c, 0x381f, 0x3930, +0x3820, 0x321b, 0x3964, 0x2964, 0x33a0, 0x2d00, 0x2490, 0x336b, 0x3465, 0x3b2e, 0x3aa0, 0x371f, 0x300e, 0x3a09, 0x3bf1, 0x25cc, 0x3b6f, 0x3384, 0x3a88, 0x3acb, 0x3814, 0x36d0, 0x3081, 0x3a2c, 0x3353, 0x39cb, 0x31ed, 0x3af6, 0x3721, 0x36c7, 0x2ce2, 0x390d, +0x3698, 0x3ab2, 0x3b3e, 0x2eb4, 0x3998, 0x39e3, 0x3a77, 0x3632, 0x2c12, 0x3bd5, 0x3ba3, 0x3bba, 0x323c, 0x367b, 0x3557, 0x39c8, 0x37db, 0x3b45, 0x3b6e, 0x3931, 0x3121, 0x3a8d, 0x3a55, 0x3b9b, 0x358a, 0x3925, 0x3491, 0x3912, 0x3b6b, 0x3584, 0x32df, 0x3120, +0x32b2, 0x3b0a, 0x2cad, 0x3465, 0x3ad3, 0x3bcd, 0x363b, 0x3afe, 0x354b, 0x3374, 0x39af, 0x3b7f, 0x308c, 0x2e72, 0x3380, 0x3b70, 0x3902, 0x38d8, 0x39f3, 0x3a4b, 0x3853, 0x397b, 0x2ebe, 0x387f, 0x2845, 0x37e2, 0x360f, 0x370b, 0x3acb, 0x35d4, 0x36e6, 0x3262, +0x2e88, 0x3a54, 0x2ee3, 0x3575, 0x3afe, 0x2aee, 0x39a0, 0x3aae, 0x3693, 0x3432, 0x3834, 0x3b9b, 0x3bcb, 0x2e3a, 0x356d, 0x374e, 0x3924, 0x383c, 0x311e, 0x3ac5, 0x352d, 0x311e, 0x38ca, 0x34d4, 0x36ca, 0x34ed, 0x3a13, 0x33eb, 0x3639, 0x3828, 0x3b3c, 0x3939, +0x3837, 0x3521, 0x2cb5, 0x3629, 0x3924, 0x384c, 0x366a, 0x3bbf, 0x2e9e, 0x3ba8, 0x33ad, 0x38c8, 0x3934, 0x3907, 0x249a, 0x3690, 0x3a09, 0x3215, 0x3898, 0x325d, 0x37d5, 0x3195, 0x361c, 0x3ae4, 0x351f, 0x3452, 0x3bc0, 0x375c, 0x39bf, 0x317a, 0x3aae, 0x283a, +0x3476, 0x3b92, 0x3472, 0x383e, 0x280f, 0x39d6, 0x2fd1, 0x31f4, 0x2ffb, 0x3b97, 0x3692, 0x36c0, 0x3989, 0x33cf, 0x3ba6, 0x3239, 0x35d7, 0x33ab, 0x31eb, 0x3b47, 0x389b, 0x3b88, 0x3580, 0x354c, 0x3802, 0x3b9a, 0x3b94, 0x2a92, 0x2db1, 0x38bd, 0x2dfb, 0x3900, +0x344f, 0x3739, 0x27a5, 0x3b2e, 0x342b, 0x34bb, 0x30c8, 0x3ae8, 0x3b26, 0x3982, 0x38c0, 0x3408, 0x38c8, 0x36ef, 0x3bf0, 0x3acf, 0x3a3c, 0x3825, 0x31a5, 0x3ada, 0x3b5b, 0x37db, 0x3a01, 0x3663, 0x3a7d, 0x327b, 0x3a1f, 0x3862, 0x38af, 0x3204, 0x372e, 0x3b19, +0x3708, 0x3622, 0x2e62, 0x39ab, 0x2d4d, 0x31b4, 0x3552, 
0x3bbc, 0x36f2, 0x36eb, 0x38ef, 0x3755, 0x3bbe, 0x2c17, 0x3815, 0x2f53, 0x363f, 0x38c1, 0x3246, 0x386b, 0x34de, 0x34e4, 0x3baa, 0x349e, 0x32ce, 0x3a68, 0x373f, 0x2cce, 0x3b36, 0x28ba, 0x3b50, 0x3232, +0x1f34, 0x3928, 0x35cd, 0x3b38, 0x30ce, 0x35a1, 0x3a06, 0x3a32, 0x3a53, 0x3489, 0x3241, 0x372f, 0x390c, 0x3a1b, 0x378a, 0x3713, 0x3769, 0x37a8, 0x3418, 0x3ad4, 0x3a4e, 0x3bf7, 0x37a5, 0x34dc, 0x39b2, 0x351b, 0x3372, 0x349f, 0x2f50, 0x3ab1, 0x3795, 0x2db7, +0x3864, 0x3157, 0x3900, 0x323e, 0x389e, 0x3880, 0x3b1f, 0x37a1, 0x396c, 0x2e43, 0x2c2a, 0x3b78, 0x3988, 0x3a14, 0x39c1, 0x3b51, 0x3780, 0x3bf2, 0x2d19, 0x3815, 0x3a5f, 0x3641, 0x2f62, 0x37d5, 0x3564, 0x139a, 0x3ab8, 0x28f7, 0x3785, 0x34e1, 0x3097, 0x3768, +0x3971, 0x3ae2, 0x32ae, 0x2fd5, 0x382a, 0x346c, 0x3133, 0x3167, 0x3940, 0x2d12, 0x389a, 0x3bd0, 0x3943, 0x391c, 0x3a75, 0x2a11, 0x391e, 0x372d, 0x3a79, 0x3b72, 0x3373, 0x39b7, 0x35d7, 0x372b, 0x3a6d, 0x38a1, 0x3279, 0x3434, 0x3694, 0x3b45, 0x3abb, 0x392d, +0x34a8, 0x3757, 0x32ca, 0x345d, 0x36a5, 0x3854, 0x2dcd, 0x30af, 0x38dd, 0x3067, 0x3411, 0x3997, 0x397a, 0x3a64, 0x38b8, 0x3962, 0x3509, 0x3bb6, 0x3a66, 0x339f, 0x372a, 0x31a8, 0x37da, 0x36ff, 0x33c6, 0x31da, 0x3977, 0x3b72, 0x3841, 0x3567, 0x3433, 0x33b8, +0x39fe, 0x3a10, 0x3bf2, 0x35e7, 0x3a4a, 0x3b3e, 0x2ec7, 0x3aa4, 0x3846, 0x3af9, 0x38a9, 0x2c1f, 0x39ab, 0x349f, 0x31d6, 0x39ae, 0x3b79, 0x352d, 0x3516, 0x347c, 0x2f33, 0x35ad, 0x31c4, 0x3b52, 0x354b, 0x3786, 0x3ab7, 0x3896, 0x34ac, 0x352f, 0x37e6, 0x326a, +0x2e44, 0x34c7, 0x388d, 0x3bf4, 0x363f, 0x3b3d, 0x33b1, 0x3b8b, 0x3340, 0x37f7, 0x3b07, 0x25bf, 0x398e, 0x3505, 0x3bd7, 0x366d, 0x388a, 0x2cc0, 0x359a, 0x3b9a, 0x3b99, 0x379d, 0x3b6b, 0x39b8, 0x3223, 0x2703, 0x3ba9, 0x2ecb, 0x3759, 0x39d8, 0x37ac, 0x32cf, +0x35f2, 0x38a3, 0x399e, 0x3bd2, 0x3780, 0x3af3, 0x3b5e, 0x337b, 0x3a08, 0x35da, 0x3446, 0x3b25, 0x3ad0, 0x3bee, 0x3141, 0x32d8, 0x34ce, 0x2ac9, 0x3800, 0x3a8a, 0x2d53, 0x368a, 0x3561, 0x3998, 0x35a3, 0x3677, 0x3ab2, 0x3269, 0x3236, 0x3b3e, 0x3aba, 0x3bac, 
+0x395d, 0x3820, 0x1df6, 0x3bb5, 0x35b5, 0x3675, 0x3b74, 0x360f, 0x34de, 0x3a0c, 0x3aeb, 0x299d, 0x3207, 0x3bd8, 0x2178, 0x3995, 0x3948, 0x3908, 0x3843, 0x2ea5, 0x3045, 0x3989, 0x345d, 0x39c5, 0x3a89, 0x3863, 0x3be0, 0x397a, 0x38f1, 0x39e2, 0x3b08, 0x352e, +0x385f, 0x28f2, 0x3bc3, 0x35e0, 0x380c, 0x3b9c, 0x3afc, 0x390a, 0x3689, 0x34fd, 0x2cf5, 0x308e, 0x342b, 0x3921, 0x3a67, 0x3ad6, 0x2986, 0x32fc, 0x35aa, 0x3507, 0x3608, 0x33fd, 0x3bf3, 0x39e2, 0x3b0f, 0x30b7, 0x3896, 0x3ae4, 0x2145, 0x35b6, 0x2e1d, 0x3ad1, +0x333d, 0x3afb, 0x2703, 0x3413, 0x1d7d, 0x3b7f, 0x3ae1, 0x303c, 0x3004, 0x39d3, 0x3554, 0x31a4, 0x354e, 0x3662, 0x39c5, 0x2eb7, 0x2c6e, 0x397f, 0x31d8, 0x1f0c, 0x38e3, 0x35f0, 0x2714, 0x28d1, 0x375e, 0x3a75, 0x3830, 0x3578, 0x397d, 0x3b18, 0x383c, 0x3498, +0x39ad, 0x3598, 0x23c4, 0x34ea, 0x3a61, 0x2b00, 0x3707, 0x3ae1, 0x37ae, 0x389d, 0x37fa, 0x3673, 0x3278, 0xf3e, 0x3809, 0x33c6, 0x3bf5, 0x3279, 0x3816, 0x360c, 0x39c8, 0x381f, 0x3741, 0x2d66, 0x38c0, 0x37d3, 0x377a, 0x3621, 0x2faf, 0x392e, 0x2de6, 0x33c5, +0x3803, 0x2600, 0x32e9, 0x39b4, 0x38d2, 0x34e8, 0x2fe6, 0x3199, 0x3643, 0x3a77, 0x27cc, 0x39d7, 0x34c6, 0x2ea8, 0x364e, 0x3b07, 0x31c7, 0x30a1, 0x31b1, 0x3b8f, 0x3571, 0x3b75, 0x3989, 0x3805, 0x39fb, 0x3945, 0x352b, 0x31d8, 0x3904, 0x3440, 0x3a57, 0x2cf7, +0x3b39, 0x2fcd, 0x2b89, 0x2edd, 0x3682, 0x36a9, 0x32c8, 0x37ac, 0x32a5, 0x3311, 0x394b, 0x3b84, 0x3aec, 0x3601, 0x2765, 0x3b69, 0x396b, 0x3727, 0x3bfe, 0x3907, 0x376f, 0x3674, 0x3973, 0x3671, 0x3491, 0x3993, 0x383f, 0x3335, 0x3989, 0x3550, 0x3077, 0x35f5, +0x3a59, 0x3950, 0x380c, 0x37cd, 0x30bf, 0x3607, 0x3afa, 0x3b5d, 0x32b9, 0x386b, 0x35bd, 0x3aca, 0x3ba5, 0x3b2d, 0x3b19, 0x3b8b, 0x345e, 0x2845, 0x34aa, 0x372a, 0x3448, 0x34f5, 0x3ae2, 0x3637, 0x2cb5, 0x354b, 0x3b15, 0x2ca8, 0x2641, 0x3178, 0x2cfe, 0x39b4, +0x3bdd, 0x3acb, 0x3a05, 0x38a2, 0x3b4a, 0x34e5, 0x395f, 0x394b, 0x34c4, 0x3aa5, 0x29bb, 0x2d96, 0x339d, 0x387c, 0x382e, 0x385a, 0x396b, 0x3aa9, 0x2f1e, 0x33a7, 0x3b90, 0x3b7b, 0x3b5f, 0x39d3, 0x3b18, 
0x354f, 0x2cdb, 0x3a6f, 0x3434, 0x34ff, 0x3a5b, 0x3b84, +0x3a33, 0x384b, 0x2e67, 0x3b85, 0x3853, 0x380c, 0x346a, 0x3aaa, 0x3492, 0x33e8, 0x3bf2, 0x38ae, 0x3a29, 0x3830, 0x3221, 0x35b1, 0x3a48, 0x2c68, 0x2ced, 0x3a7e, 0x3539, 0x3922, 0x374c, 0x3aaa, 0x2dae, 0x395d, 0x3b3d, 0x3890, 0x2cfe, 0x2dd6, 0x3bad, 0x33c5, +0x2c07, 0x3a2c, 0x37a8, 0x390f, 0x2fc8, 0x35ae, 0x388c, 0x30ee, 0x3674, 0x391d, 0x3bfc, 0x36bf, 0x322d, 0x3a78, 0x35c0, 0x3492, 0x3ac8, 0x3504, 0x3315, 0x381d, 0x3a7a, 0x3a08, 0x343c, 0x3bda, 0x341b, 0x39f0, 0x3b9e, 0x395d, 0x3c00, 0x38ab, 0x3bcf, 0x3564, +0x33c4, 0x3b0d, 0x3623, 0x33b9, 0x3b92, 0x1e71, 0x2c57, 0x36d0, 0x314b, 0x3a16, 0x3372, 0x341b, 0x3aaa, 0x3444, 0x396b, 0x2dd7, 0x3b30, 0x3559, 0x3b5b, 0x3a29, 0x2d19, 0x38b7, 0x3b01, 0x3afa, 0x398a, 0x3839, 0x3ac9, 0x2e31, 0x3924, 0x39f2, 0x3a7f, 0x3285 +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/w_input.h b/hwpe/redmule_256iter/inc/w_input.h new file mode 100644 index 0000000..dc4d3be --- /dev/null +++ b/hwpe/redmule_256iter/inc/w_input.h @@ -0,0 +1,35 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t w_inp [1024] = { +0x311a, 0x39e0, 0x387d, 0x3a4a, 0x386f, 0x3ada, 0x392f, 0x3854, 0x3014, 0x2fd2, 0x31c9, 0x2fca, 0x2e55, 0x3bc8, 0x396d, 0x3b1d, 0x39f6, 0x333a, 0x3908, 0x3628, 0x3bab, 0x3b8b, 0x3b4a, 0x322d, 0x3925, 0x317a, 0x3725, 0x31c2, 0x3066, 0x38f3, 0x3a17, 0x3476, +0x3bda, 0x3196, 0x3922, 0x3680, 0x396a, 0x3021, 0x3761, 0x374d, 0x2fc2, 0x3967, 0x3b94, 0x33b5, 0x3797, 0x34d6, 0x3655, 0x2176, 0x39bc, 0x3999, 0x3658, 0x3904, 0x3759, 0x2ade, 0x3a5a, 0x3b78, 0x36c7, 0x2d01, 0x3b58, 0x2d9a, 0x373d, 0x3952, 0x38e8, 0x3887, +0x37b6, 0x3a88, 0x2f8a, 0x2d79, 0x3413, 0x3421, 0x3976, 0x32b2, 0x3446, 0x2d99, 0x3a56, 0x3322, 0x3b49, 0x39fa, 0x3acd, 0x3af6, 0x304c, 0x3abb, 0x3a83, 0x38b2, 0x3ab9, 0x363e, 0x389f, 0x31bb, 0x38e1, 0x3bc4, 0x3b9b, 0x2984, 0x3a43, 0x3b2f, 0x35d6, 0x3bda, +0x2df3, 0x3bf8, 0x2acc, 0x378b, 0x3555, 0x2e59, 0x31d4, 0x34ec, 0x3a46, 0x3bab, 0x3214, 
0x3161, 0x3470, 0x3a03, 0x368e, 0x31ad, 0x27cb, 0x2ecb, 0x3422, 0x39f7, 0x3644, 0x3a77, 0x313f, 0x34f2, 0x39b3, 0x3bf2, 0x379a, 0x3456, 0x35fe, 0x3ae7, 0x3964, 0x385f, +0x3b16, 0x3999, 0x3833, 0x2eda, 0x3afd, 0x3a4a, 0x3ba2, 0x2bd4, 0x3b38, 0x31a2, 0x32dd, 0x353c, 0x366f, 0x375e, 0x3821, 0x367a, 0x3b44, 0x39e6, 0x3787, 0x339e, 0x39d7, 0x38c6, 0x37d5, 0x342f, 0x3984, 0x319b, 0x33b5, 0x35ab, 0x398a, 0x374e, 0x36b6, 0x3b21, +0x3bbb, 0x2ab3, 0x2ad5, 0x33bc, 0x2bef, 0x3780, 0x3738, 0x3a0b, 0x3b09, 0x30ca, 0x384e, 0x3ab3, 0x39bd, 0x3453, 0x3a6d, 0x3957, 0x2c10, 0x30e9, 0x35d4, 0x3aef, 0x3be9, 0x39ad, 0x3a74, 0x3af9, 0x3739, 0x2d4d, 0x39fe, 0x3b72, 0x2c57, 0x398c, 0x381f, 0x3930, +0x3820, 0x321b, 0x3964, 0x2964, 0x33a0, 0x2d00, 0x2490, 0x336b, 0x3465, 0x3b2e, 0x3aa0, 0x371f, 0x300e, 0x3a09, 0x3bf1, 0x25cc, 0x3b6f, 0x3384, 0x3a88, 0x3acb, 0x3814, 0x36d0, 0x3081, 0x3a2c, 0x3353, 0x39cb, 0x31ed, 0x3af6, 0x3721, 0x36c7, 0x2ce2, 0x390d, +0x3698, 0x3ab2, 0x3b3e, 0x2eb4, 0x3998, 0x39e3, 0x3a77, 0x3632, 0x2c12, 0x3bd5, 0x3ba3, 0x3bba, 0x323c, 0x367b, 0x3557, 0x39c8, 0x37db, 0x3b45, 0x3b6e, 0x3931, 0x3121, 0x3a8d, 0x3a55, 0x3b9b, 0x358a, 0x3925, 0x3491, 0x3912, 0x3b6b, 0x3584, 0x32df, 0x3120, +0x32b2, 0x3b0a, 0x2cad, 0x3465, 0x3ad3, 0x3bcd, 0x363b, 0x3afe, 0x354b, 0x3374, 0x39af, 0x3b7f, 0x308c, 0x2e72, 0x3380, 0x3b70, 0x3902, 0x38d8, 0x39f3, 0x3a4b, 0x3853, 0x397b, 0x2ebe, 0x387f, 0x2845, 0x37e2, 0x360f, 0x370b, 0x3acb, 0x35d4, 0x36e6, 0x3262, +0x2e88, 0x3a54, 0x2ee3, 0x3575, 0x3afe, 0x2aee, 0x39a0, 0x3aae, 0x3693, 0x3432, 0x3834, 0x3b9b, 0x3bcb, 0x2e3a, 0x356d, 0x374e, 0x3924, 0x383c, 0x311e, 0x3ac5, 0x352d, 0x311e, 0x38ca, 0x34d4, 0x36ca, 0x34ed, 0x3a13, 0x33eb, 0x3639, 0x3828, 0x3b3c, 0x3939, +0x3837, 0x3521, 0x2cb5, 0x3629, 0x3924, 0x384c, 0x366a, 0x3bbf, 0x2e9e, 0x3ba8, 0x33ad, 0x38c8, 0x3934, 0x3907, 0x249a, 0x3690, 0x3a09, 0x3215, 0x3898, 0x325d, 0x37d5, 0x3195, 0x361c, 0x3ae4, 0x351f, 0x3452, 0x3bc0, 0x375c, 0x39bf, 0x317a, 0x3aae, 0x283a, +0x3476, 0x3b92, 0x3472, 0x383e, 
0x280f, 0x39d6, 0x2fd1, 0x31f4, 0x2ffb, 0x3b97, 0x3692, 0x36c0, 0x3989, 0x33cf, 0x3ba6, 0x3239, 0x35d7, 0x33ab, 0x31eb, 0x3b47, 0x389b, 0x3b88, 0x3580, 0x354c, 0x3802, 0x3b9a, 0x3b94, 0x2a92, 0x2db1, 0x38bd, 0x2dfb, 0x3900, +0x344f, 0x3739, 0x27a5, 0x3b2e, 0x342b, 0x34bb, 0x30c8, 0x3ae8, 0x3b26, 0x3982, 0x38c0, 0x3408, 0x38c8, 0x36ef, 0x3bf0, 0x3acf, 0x3a3c, 0x3825, 0x31a5, 0x3ada, 0x3b5b, 0x37db, 0x3a01, 0x3663, 0x3a7d, 0x327b, 0x3a1f, 0x3862, 0x38af, 0x3204, 0x372e, 0x3b19, +0x3708, 0x3622, 0x2e62, 0x39ab, 0x2d4d, 0x31b4, 0x3552, 0x3bbc, 0x36f2, 0x36eb, 0x38ef, 0x3755, 0x3bbe, 0x2c17, 0x3815, 0x2f53, 0x363f, 0x38c1, 0x3246, 0x386b, 0x34de, 0x34e4, 0x3baa, 0x349e, 0x32ce, 0x3a68, 0x373f, 0x2cce, 0x3b36, 0x28ba, 0x3b50, 0x3232, +0x1f34, 0x3928, 0x35cd, 0x3b38, 0x30ce, 0x35a1, 0x3a06, 0x3a32, 0x3a53, 0x3489, 0x3241, 0x372f, 0x390c, 0x3a1b, 0x378a, 0x3713, 0x3769, 0x37a8, 0x3418, 0x3ad4, 0x3a4e, 0x3bf7, 0x37a5, 0x34dc, 0x39b2, 0x351b, 0x3372, 0x349f, 0x2f50, 0x3ab1, 0x3795, 0x2db7, +0x3864, 0x3157, 0x3900, 0x323e, 0x389e, 0x3880, 0x3b1f, 0x37a1, 0x396c, 0x2e43, 0x2c2a, 0x3b78, 0x3988, 0x3a14, 0x39c1, 0x3b51, 0x3780, 0x3bf2, 0x2d19, 0x3815, 0x3a5f, 0x3641, 0x2f62, 0x37d5, 0x3564, 0x139a, 0x3ab8, 0x28f7, 0x3785, 0x34e1, 0x3097, 0x3768, +0x3971, 0x3ae2, 0x32ae, 0x2fd5, 0x382a, 0x346c, 0x3133, 0x3167, 0x3940, 0x2d12, 0x389a, 0x3bd0, 0x3943, 0x391c, 0x3a75, 0x2a11, 0x391e, 0x372d, 0x3a79, 0x3b72, 0x3373, 0x39b7, 0x35d7, 0x372b, 0x3a6d, 0x38a1, 0x3279, 0x3434, 0x3694, 0x3b45, 0x3abb, 0x392d, +0x34a8, 0x3757, 0x32ca, 0x345d, 0x36a5, 0x3854, 0x2dcd, 0x30af, 0x38dd, 0x3067, 0x3411, 0x3997, 0x397a, 0x3a64, 0x38b8, 0x3962, 0x3509, 0x3bb6, 0x3a66, 0x339f, 0x372a, 0x31a8, 0x37da, 0x36ff, 0x33c6, 0x31da, 0x3977, 0x3b72, 0x3841, 0x3567, 0x3433, 0x33b8, +0x39fe, 0x3a10, 0x3bf2, 0x35e7, 0x3a4a, 0x3b3e, 0x2ec7, 0x3aa4, 0x3846, 0x3af9, 0x38a9, 0x2c1f, 0x39ab, 0x349f, 0x31d6, 0x39ae, 0x3b79, 0x352d, 0x3516, 0x347c, 0x2f33, 0x35ad, 0x31c4, 0x3b52, 0x354b, 0x3786, 0x3ab7, 0x3896, 0x34ac, 
0x352f, 0x37e6, 0x326a, +0x2e44, 0x34c7, 0x388d, 0x3bf4, 0x363f, 0x3b3d, 0x33b1, 0x3b8b, 0x3340, 0x37f7, 0x3b07, 0x25bf, 0x398e, 0x3505, 0x3bd7, 0x366d, 0x388a, 0x2cc0, 0x359a, 0x3b9a, 0x3b99, 0x379d, 0x3b6b, 0x39b8, 0x3223, 0x2703, 0x3ba9, 0x2ecb, 0x3759, 0x39d8, 0x37ac, 0x32cf, +0x35f2, 0x38a3, 0x399e, 0x3bd2, 0x3780, 0x3af3, 0x3b5e, 0x337b, 0x3a08, 0x35da, 0x3446, 0x3b25, 0x3ad0, 0x3bee, 0x3141, 0x32d8, 0x34ce, 0x2ac9, 0x3800, 0x3a8a, 0x2d53, 0x368a, 0x3561, 0x3998, 0x35a3, 0x3677, 0x3ab2, 0x3269, 0x3236, 0x3b3e, 0x3aba, 0x3bac, +0x395d, 0x3820, 0x1df6, 0x3bb5, 0x35b5, 0x3675, 0x3b74, 0x360f, 0x34de, 0x3a0c, 0x3aeb, 0x299d, 0x3207, 0x3bd8, 0x2178, 0x3995, 0x3948, 0x3908, 0x3843, 0x2ea5, 0x3045, 0x3989, 0x345d, 0x39c5, 0x3a89, 0x3863, 0x3be0, 0x397a, 0x38f1, 0x39e2, 0x3b08, 0x352e, +0x385f, 0x28f2, 0x3bc3, 0x35e0, 0x380c, 0x3b9c, 0x3afc, 0x390a, 0x3689, 0x34fd, 0x2cf5, 0x308e, 0x342b, 0x3921, 0x3a67, 0x3ad6, 0x2986, 0x32fc, 0x35aa, 0x3507, 0x3608, 0x33fd, 0x3bf3, 0x39e2, 0x3b0f, 0x30b7, 0x3896, 0x3ae4, 0x2145, 0x35b6, 0x2e1d, 0x3ad1, +0x333d, 0x3afb, 0x2703, 0x3413, 0x1d7d, 0x3b7f, 0x3ae1, 0x303c, 0x3004, 0x39d3, 0x3554, 0x31a4, 0x354e, 0x3662, 0x39c5, 0x2eb7, 0x2c6e, 0x397f, 0x31d8, 0x1f0c, 0x38e3, 0x35f0, 0x2714, 0x28d1, 0x375e, 0x3a75, 0x3830, 0x3578, 0x397d, 0x3b18, 0x383c, 0x3498, +0x39ad, 0x3598, 0x23c4, 0x34ea, 0x3a61, 0x2b00, 0x3707, 0x3ae1, 0x37ae, 0x389d, 0x37fa, 0x3673, 0x3278, 0xf3e, 0x3809, 0x33c6, 0x3bf5, 0x3279, 0x3816, 0x360c, 0x39c8, 0x381f, 0x3741, 0x2d66, 0x38c0, 0x37d3, 0x377a, 0x3621, 0x2faf, 0x392e, 0x2de6, 0x33c5, +0x3803, 0x2600, 0x32e9, 0x39b4, 0x38d2, 0x34e8, 0x2fe6, 0x3199, 0x3643, 0x3a77, 0x27cc, 0x39d7, 0x34c6, 0x2ea8, 0x364e, 0x3b07, 0x31c7, 0x30a1, 0x31b1, 0x3b8f, 0x3571, 0x3b75, 0x3989, 0x3805, 0x39fb, 0x3945, 0x352b, 0x31d8, 0x3904, 0x3440, 0x3a57, 0x2cf7, +0x3b39, 0x2fcd, 0x2b89, 0x2edd, 0x3682, 0x36a9, 0x32c8, 0x37ac, 0x32a5, 0x3311, 0x394b, 0x3b84, 0x3aec, 0x3601, 0x2765, 0x3b69, 0x396b, 0x3727, 0x3bfe, 0x3907, 0x376f, 0x3674, 
0x3973, 0x3671, 0x3491, 0x3993, 0x383f, 0x3335, 0x3989, 0x3550, 0x3077, 0x35f5, +0x3a59, 0x3950, 0x380c, 0x37cd, 0x30bf, 0x3607, 0x3afa, 0x3b5d, 0x32b9, 0x386b, 0x35bd, 0x3aca, 0x3ba5, 0x3b2d, 0x3b19, 0x3b8b, 0x345e, 0x2845, 0x34aa, 0x372a, 0x3448, 0x34f5, 0x3ae2, 0x3637, 0x2cb5, 0x354b, 0x3b15, 0x2ca8, 0x2641, 0x3178, 0x2cfe, 0x39b4, +0x3bdd, 0x3acb, 0x3a05, 0x38a2, 0x3b4a, 0x34e5, 0x395f, 0x394b, 0x34c4, 0x3aa5, 0x29bb, 0x2d96, 0x339d, 0x387c, 0x382e, 0x385a, 0x396b, 0x3aa9, 0x2f1e, 0x33a7, 0x3b90, 0x3b7b, 0x3b5f, 0x39d3, 0x3b18, 0x354f, 0x2cdb, 0x3a6f, 0x3434, 0x34ff, 0x3a5b, 0x3b84, +0x3a33, 0x384b, 0x2e67, 0x3b85, 0x3853, 0x380c, 0x346a, 0x3aaa, 0x3492, 0x33e8, 0x3bf2, 0x38ae, 0x3a29, 0x3830, 0x3221, 0x35b1, 0x3a48, 0x2c68, 0x2ced, 0x3a7e, 0x3539, 0x3922, 0x374c, 0x3aaa, 0x2dae, 0x395d, 0x3b3d, 0x3890, 0x2cfe, 0x2dd6, 0x3bad, 0x33c5, +0x2c07, 0x3a2c, 0x37a8, 0x390f, 0x2fc8, 0x35ae, 0x388c, 0x30ee, 0x3674, 0x391d, 0x3bfc, 0x36bf, 0x322d, 0x3a78, 0x35c0, 0x3492, 0x3ac8, 0x3504, 0x3315, 0x381d, 0x3a7a, 0x3a08, 0x343c, 0x3bda, 0x341b, 0x39f0, 0x3b9e, 0x395d, 0x3c00, 0x38ab, 0x3bcf, 0x3564, +0x33c4, 0x3b0d, 0x3623, 0x33b9, 0x3b92, 0x1e71, 0x2c57, 0x36d0, 0x314b, 0x3a16, 0x3372, 0x341b, 0x3aaa, 0x3444, 0x396b, 0x2dd7, 0x3b30, 0x3559, 0x3b5b, 0x3a29, 0x2d19, 0x38b7, 0x3b01, 0x3afa, 0x398a, 0x3839, 0x3ac9, 0x2e31, 0x3924, 0x39f2, 0x3a7f, 0x3285 +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/x_2D.h b/hwpe/redmule_256iter/inc/x_2D.h new file mode 100644 index 0000000..0b589f8 --- /dev/null +++ b/hwpe/redmule_256iter/inc/x_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t x_inp_2D [24][32] = { +0x2153, 0x3bb5, 0x3896, 0x365f, 0x2483, 0x3518, 0x2dd1, 0x3bca, 0x397b, 0x29b1, 0x3705, 0x36c8, 0x398b, 0x3661, 0x2f05, 0x365a, 0x3bf9, 0x34df, 0x363b, 0x38d9, 0x39c6, 0x3abb, 0x3952, 0x38f2, 0x392d, 0x3b3e, 0x2afb, 0x3a9d, 0x353b, 0x3b73, 0x3a01, 0x3679, +0x3934, 0x397d, 0x2904, 0x3822, 0x3462, 0x3b44, 0x39e9, 0x28be, 
0x331e, 0x3a1d, 0x39e5, 0x34da, 0x3a19, 0x3906, 0x1d35, 0x3871, 0x31e7, 0x3b29, 0x325d, 0x3797, 0x2b2f, 0x38b4, 0x232f, 0x38aa, 0x3aca, 0x316f, 0x3811, 0x3950, 0x32ea, 0x3bc7, 0x382c, 0x38a2, +0x29ce, 0x3afa, 0x3a39, 0x2ccc, 0x39fd, 0x3b3d, 0x384a, 0x3a35, 0x3802, 0x366a, 0x37ec, 0x3598, 0x3bf8, 0x3a85, 0x3a1b, 0x386e, 0x3b4c, 0x39de, 0x38c2, 0x2f93, 0x3b4c, 0x39c4, 0x3b9e, 0x3844, 0x346d, 0x3bff, 0x32ce, 0x296d, 0x3130, 0x3b3d, 0x3b44, 0x369d, +0x3b13, 0x31ed, 0x330a, 0x3831, 0x34e7, 0x37b3, 0x331a, 0x3918, 0x32d3, 0x3995, 0x3991, 0x3919, 0x3a26, 0x385b, 0x2b76, 0x3a3b, 0x37f2, 0x26a7, 0x3225, 0x3b64, 0x28f0, 0x3456, 0x3822, 0x341e, 0x381a, 0x38d8, 0x2c11, 0x33be, 0x33ac, 0x353f, 0x3476, 0x3abc, +0x36ec, 0x3a1d, 0x39d3, 0x3821, 0x36ac, 0x3bce, 0x3ad2, 0x3616, 0x36a1, 0x2cb3, 0x38d2, 0x314f, 0x385c, 0x3b63, 0x3bb6, 0x2951, 0x372d, 0x2c42, 0x3823, 0x3883, 0x3872, 0x31ee, 0x36c5, 0x399a, 0x31b0, 0x3887, 0x3884, 0x3865, 0x3896, 0x36c3, 0x32e3, 0x346c, +0x3935, 0x3b50, 0x2b6d, 0x38cd, 0x388f, 0x3389, 0x395d, 0x31cd, 0x2efd, 0x3154, 0x2f35, 0x3444, 0x3293, 0x3b6b, 0x1bec, 0x3b69, 0x3bf3, 0x3611, 0x3508, 0x3742, 0x3a50, 0x3ab7, 0x3457, 0x38d3, 0x3344, 0x38e8, 0x33c0, 0x3668, 0x3bee, 0x3b21, 0x3727, 0x3121, +0x316c, 0x3288, 0x2d50, 0x2e74, 0x35d5, 0x37e2, 0x303d, 0x36af, 0x341f, 0x3436, 0x2df7, 0x399d, 0x30f4, 0x3aaf, 0x34e4, 0x2c2a, 0x3116, 0x34d3, 0x36ac, 0x35e3, 0x3760, 0x36e1, 0x3ad2, 0x3547, 0x38f4, 0x369c, 0x3ba9, 0x34f0, 0x3a39, 0x3b19, 0x36e6, 0x395d, +0x3be8, 0x3293, 0x3bfc, 0x3435, 0x2eb3, 0x3360, 0x3919, 0x3bed, 0x396a, 0x37fc, 0x3242, 0x384b, 0x38cb, 0x3b2c, 0x3b28, 0x28cf, 0x3828, 0x3855, 0x3ba9, 0x2fa7, 0x340b, 0x32f1, 0x3ada, 0x36fa, 0x31f5, 0x3436, 0x29d0, 0x33e6, 0x3232, 0x3bec, 0x3904, 0x2797, +0x3b81, 0x3bac, 0x38d2, 0x343d, 0x31af, 0x3b1e, 0x33fc, 0x3864, 0x3624, 0x3905, 0x2945, 0x3b52, 0x2d08, 0x3a17, 0x3b84, 0x3804, 0x3a24, 0x38a3, 0x3562, 0x3ae6, 0x3bba, 0x3a45, 0x3679, 0x31fa, 0x3994, 0x2c3d, 0x383f, 0x399d, 0x34f7, 0x360e, 0x35f3, 0x38f0, +0x38d4, 
0x399a, 0x3a48, 0x3987, 0x3b54, 0x382c, 0x3210, 0x35ef, 0x36ca, 0x31b4, 0x3625, 0x371f, 0x37bd, 0x3680, 0x3a3a, 0x3ac0, 0x3bbf, 0x3bf5, 0x39f2, 0x29c2, 0x363e, 0x3a4e, 0x3596, 0x3b1b, 0x3459, 0x3669, 0x3aa1, 0x39c3, 0x3376, 0x390d, 0x2456, 0x39b5, +0x3a66, 0x3ad8, 0x3b51, 0x36aa, 0x32be, 0x3ac8, 0x392b, 0x3740, 0x3a48, 0x38f5, 0x3b2d, 0x3a5f, 0x2ff3, 0x366f, 0x39d3, 0x35e5, 0x3822, 0x38db, 0x3b8a, 0x34be, 0x2d33, 0x36dd, 0x3578, 0x3bdf, 0x2c7e, 0x39cf, 0x32ff, 0x35c9, 0x3970, 0x3bcb, 0x351e, 0x3956, +0x2c42, 0x3308, 0x377a, 0x361c, 0x39a0, 0x36c9, 0x2dcb, 0x3bf2, 0x3b5f, 0x33ee, 0x24c1, 0x2ce9, 0x3927, 0x305d, 0x3702, 0x3119, 0x35f9, 0x3855, 0x3374, 0x349b, 0x3bcf, 0x2dea, 0x34f0, 0x363f, 0x37da, 0x3a74, 0x35fc, 0x35fa, 0x316b, 0x3804, 0x37a7, 0x3986, +0x3073, 0x3aed, 0x31c7, 0x3844, 0x34a4, 0x387d, 0x3a20, 0x3037, 0x3a00, 0x3b70, 0x377f, 0x3686, 0x3b7e, 0x38b3, 0x32e3, 0x3323, 0x391e, 0x3228, 0x3930, 0x3997, 0x3a5e, 0x398b, 0x3512, 0x35b0, 0x365c, 0x325d, 0x3b61, 0x38b8, 0x39a4, 0x3423, 0x3bd7, 0x38af, +0x2d3d, 0x382d, 0x38ac, 0x26ca, 0x395e, 0x21a8, 0x3520, 0x386f, 0x3b95, 0x32c0, 0x3b84, 0x3a51, 0x3b4b, 0x31d2, 0x3747, 0x3b96, 0x3b40, 0x3535, 0x38d1, 0x3899, 0x3b00, 0x3827, 0x3ae3, 0x38c8, 0x3a07, 0x338d, 0x2e96, 0x3a46, 0x394a, 0x39de, 0x2951, 0x3a02, +0x3838, 0x2d45, 0x28c0, 0x3958, 0x3070, 0x2aa2, 0x3510, 0x38ce, 0x271c, 0x3440, 0x3954, 0x30bc, 0x3b35, 0x2f1d, 0x3afb, 0x2dae, 0x356f, 0x2e13, 0x3981, 0x326d, 0x3a28, 0x3a36, 0x3a95, 0x38cb, 0x38db, 0x3150, 0x2c9e, 0x34c5, 0x3adb, 0x3bdf, 0x38f2, 0x3994, +0x36f8, 0x31c0, 0x3a4f, 0x3825, 0x394b, 0x3a8b, 0x38ac, 0x3167, 0x2e2d, 0x3a93, 0x34f3, 0x37bd, 0x3b63, 0x2f2f, 0x3ae0, 0x3ad8, 0x34a8, 0x2e1c, 0x3890, 0x3705, 0x3b69, 0x3bc1, 0x28af, 0x3b36, 0x348b, 0x3111, 0x3a8d, 0x389c, 0x3916, 0x36dc, 0x3bae, 0x3874, +0x3593, 0x3638, 0x3018, 0x3a56, 0x38a3, 0x2ad4, 0x3a25, 0x38d7, 0x3864, 0x31c1, 0x28d1, 0x39c8, 0x37d6, 0x2c7f, 0x3ba5, 0x34b8, 0x3bef, 0x3b83, 0x3ab5, 0x3062, 0x38bc, 0x399c, 0x2ce4, 0x2f2c, 0x39bf, 0x2ed1, 
0x385f, 0x37e0, 0x35ee, 0x397d, 0x3b0c, 0x3049, +0x39d5, 0x322e, 0x3936, 0x3747, 0x2e15, 0x3b41, 0x3874, 0x3bd0, 0x2c04, 0x3800, 0x375b, 0x3b2d, 0x38d8, 0x3a51, 0x3406, 0x38da, 0x38ba, 0x3497, 0x382e, 0x35fc, 0x39d4, 0x3775, 0x3b1e, 0x3813, 0x3649, 0x31af, 0x37bb, 0x334a, 0x3a6e, 0x3284, 0x26e0, 0x2e01, +0x2ebb, 0x344b, 0x3821, 0x381a, 0x385a, 0x2534, 0x3635, 0x2a92, 0x3b8c, 0x31f0, 0x3947, 0x3ac7, 0x3743, 0x3924, 0x39e4, 0x358f, 0x2b62, 0x392c, 0x3955, 0x3341, 0x3676, 0x38ac, 0x3957, 0x335b, 0x2ca2, 0x39ff, 0x37cb, 0x341f, 0x3ac9, 0x3b6c, 0x2f14, 0x34c3, +0x3018, 0x3169, 0x355b, 0x3624, 0x31ed, 0x379e, 0x3268, 0x309b, 0x35db, 0x3872, 0x3bdb, 0x34c7, 0x3408, 0x3359, 0x3920, 0x331f, 0x3866, 0x3af0, 0x2a1a, 0x39e0, 0x3b14, 0x34fa, 0x2d18, 0x3963, 0x35e8, 0x2539, 0x38f5, 0x37b3, 0x378f, 0x31b5, 0x3a6c, 0x3685, +0x3a06, 0x318a, 0x2934, 0x33c1, 0x3be8, 0x375b, 0x3860, 0x3543, 0x3702, 0x3951, 0x3677, 0x37ff, 0x2e27, 0x2e3a, 0x340f, 0x3817, 0x2f04, 0x357e, 0x3a1d, 0x2dd6, 0x252a, 0x3945, 0x162a, 0x3b19, 0x3a53, 0x35d2, 0x3a5d, 0x3474, 0x38e9, 0x374b, 0x387c, 0x1f1a, +0x38ac, 0x3291, 0x3393, 0x3b53, 0x3169, 0x3bca, 0x2f1a, 0x3551, 0x38a3, 0x28e3, 0x369d, 0x34a1, 0x38a8, 0x34c3, 0x3841, 0x390d, 0x3b13, 0x3282, 0x3a29, 0x3a78, 0x2df3, 0x3a37, 0x35f4, 0x35a6, 0x38e8, 0x3328, 0x3beb, 0x390b, 0x32dc, 0x34dc, 0x396d, 0x3a78, +0x39ba, 0x3a06, 0x2cdd, 0x3bc3, 0x2d43, 0x2992, 0x3663, 0x3a68, 0x2c3e, 0x394e, 0x2c9f, 0x380e, 0x37f5, 0x3557, 0x2873, 0x390f, 0x39e7, 0x3939, 0x3669, 0x385c, 0x3a68, 0x32c4, 0x2b04, 0x2d6d, 0x39d3, 0x3895, 0x331d, 0x3b59, 0x3463, 0x2b6a, 0x31de, 0x3296, +0x3aae, 0x3bcd, 0x345a, 0x3897, 0x374b, 0x3bd4, 0x38a2, 0x357f, 0x3402, 0x3a0c, 0x3507, 0x3865, 0x3a54, 0x3878, 0x3859, 0x383e, 0x32b5, 0x34ea, 0x328d, 0x38b6, 0x3464, 0x2f5b, 0x35ff, 0x3817, 0x2f24, 0x3533, 0x3b21, 0x37ba, 0x3837, 0x2e34, 0x3bad, 0x34bc +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/x_input.h b/hwpe/redmule_256iter/inc/x_input.h new file mode 100644 index 
0000000..1e38d23 --- /dev/null +++ b/hwpe/redmule_256iter/inc/x_input.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t x_inp [768] = { +0x2153, 0x3bb5, 0x3896, 0x365f, 0x2483, 0x3518, 0x2dd1, 0x3bca, 0x397b, 0x29b1, 0x3705, 0x36c8, 0x398b, 0x3661, 0x2f05, 0x365a, 0x3bf9, 0x34df, 0x363b, 0x38d9, 0x39c6, 0x3abb, 0x3952, 0x38f2, 0x392d, 0x3b3e, 0x2afb, 0x3a9d, 0x353b, 0x3b73, 0x3a01, 0x3679, +0x3934, 0x397d, 0x2904, 0x3822, 0x3462, 0x3b44, 0x39e9, 0x28be, 0x331e, 0x3a1d, 0x39e5, 0x34da, 0x3a19, 0x3906, 0x1d35, 0x3871, 0x31e7, 0x3b29, 0x325d, 0x3797, 0x2b2f, 0x38b4, 0x232f, 0x38aa, 0x3aca, 0x316f, 0x3811, 0x3950, 0x32ea, 0x3bc7, 0x382c, 0x38a2, +0x29ce, 0x3afa, 0x3a39, 0x2ccc, 0x39fd, 0x3b3d, 0x384a, 0x3a35, 0x3802, 0x366a, 0x37ec, 0x3598, 0x3bf8, 0x3a85, 0x3a1b, 0x386e, 0x3b4c, 0x39de, 0x38c2, 0x2f93, 0x3b4c, 0x39c4, 0x3b9e, 0x3844, 0x346d, 0x3bff, 0x32ce, 0x296d, 0x3130, 0x3b3d, 0x3b44, 0x369d, +0x3b13, 0x31ed, 0x330a, 0x3831, 0x34e7, 0x37b3, 0x331a, 0x3918, 0x32d3, 0x3995, 0x3991, 0x3919, 0x3a26, 0x385b, 0x2b76, 0x3a3b, 0x37f2, 0x26a7, 0x3225, 0x3b64, 0x28f0, 0x3456, 0x3822, 0x341e, 0x381a, 0x38d8, 0x2c11, 0x33be, 0x33ac, 0x353f, 0x3476, 0x3abc, +0x36ec, 0x3a1d, 0x39d3, 0x3821, 0x36ac, 0x3bce, 0x3ad2, 0x3616, 0x36a1, 0x2cb3, 0x38d2, 0x314f, 0x385c, 0x3b63, 0x3bb6, 0x2951, 0x372d, 0x2c42, 0x3823, 0x3883, 0x3872, 0x31ee, 0x36c5, 0x399a, 0x31b0, 0x3887, 0x3884, 0x3865, 0x3896, 0x36c3, 0x32e3, 0x346c, +0x3935, 0x3b50, 0x2b6d, 0x38cd, 0x388f, 0x3389, 0x395d, 0x31cd, 0x2efd, 0x3154, 0x2f35, 0x3444, 0x3293, 0x3b6b, 0x1bec, 0x3b69, 0x3bf3, 0x3611, 0x3508, 0x3742, 0x3a50, 0x3ab7, 0x3457, 0x38d3, 0x3344, 0x38e8, 0x33c0, 0x3668, 0x3bee, 0x3b21, 0x3727, 0x3121, +0x316c, 0x3288, 0x2d50, 0x2e74, 0x35d5, 0x37e2, 0x303d, 0x36af, 0x341f, 0x3436, 0x2df7, 0x399d, 0x30f4, 0x3aaf, 0x34e4, 0x2c2a, 0x3116, 0x34d3, 0x36ac, 0x35e3, 0x3760, 0x36e1, 0x3ad2, 0x3547, 0x38f4, 0x369c, 0x3ba9, 0x34f0, 0x3a39, 0x3b19, 0x36e6, 0x395d, +0x3be8, 0x3293, 0x3bfc, 
0x3435, 0x2eb3, 0x3360, 0x3919, 0x3bed, 0x396a, 0x37fc, 0x3242, 0x384b, 0x38cb, 0x3b2c, 0x3b28, 0x28cf, 0x3828, 0x3855, 0x3ba9, 0x2fa7, 0x340b, 0x32f1, 0x3ada, 0x36fa, 0x31f5, 0x3436, 0x29d0, 0x33e6, 0x3232, 0x3bec, 0x3904, 0x2797, +0x3b81, 0x3bac, 0x38d2, 0x343d, 0x31af, 0x3b1e, 0x33fc, 0x3864, 0x3624, 0x3905, 0x2945, 0x3b52, 0x2d08, 0x3a17, 0x3b84, 0x3804, 0x3a24, 0x38a3, 0x3562, 0x3ae6, 0x3bba, 0x3a45, 0x3679, 0x31fa, 0x3994, 0x2c3d, 0x383f, 0x399d, 0x34f7, 0x360e, 0x35f3, 0x38f0, +0x38d4, 0x399a, 0x3a48, 0x3987, 0x3b54, 0x382c, 0x3210, 0x35ef, 0x36ca, 0x31b4, 0x3625, 0x371f, 0x37bd, 0x3680, 0x3a3a, 0x3ac0, 0x3bbf, 0x3bf5, 0x39f2, 0x29c2, 0x363e, 0x3a4e, 0x3596, 0x3b1b, 0x3459, 0x3669, 0x3aa1, 0x39c3, 0x3376, 0x390d, 0x2456, 0x39b5, +0x3a66, 0x3ad8, 0x3b51, 0x36aa, 0x32be, 0x3ac8, 0x392b, 0x3740, 0x3a48, 0x38f5, 0x3b2d, 0x3a5f, 0x2ff3, 0x366f, 0x39d3, 0x35e5, 0x3822, 0x38db, 0x3b8a, 0x34be, 0x2d33, 0x36dd, 0x3578, 0x3bdf, 0x2c7e, 0x39cf, 0x32ff, 0x35c9, 0x3970, 0x3bcb, 0x351e, 0x3956, +0x2c42, 0x3308, 0x377a, 0x361c, 0x39a0, 0x36c9, 0x2dcb, 0x3bf2, 0x3b5f, 0x33ee, 0x24c1, 0x2ce9, 0x3927, 0x305d, 0x3702, 0x3119, 0x35f9, 0x3855, 0x3374, 0x349b, 0x3bcf, 0x2dea, 0x34f0, 0x363f, 0x37da, 0x3a74, 0x35fc, 0x35fa, 0x316b, 0x3804, 0x37a7, 0x3986, +0x3073, 0x3aed, 0x31c7, 0x3844, 0x34a4, 0x387d, 0x3a20, 0x3037, 0x3a00, 0x3b70, 0x377f, 0x3686, 0x3b7e, 0x38b3, 0x32e3, 0x3323, 0x391e, 0x3228, 0x3930, 0x3997, 0x3a5e, 0x398b, 0x3512, 0x35b0, 0x365c, 0x325d, 0x3b61, 0x38b8, 0x39a4, 0x3423, 0x3bd7, 0x38af, +0x2d3d, 0x382d, 0x38ac, 0x26ca, 0x395e, 0x21a8, 0x3520, 0x386f, 0x3b95, 0x32c0, 0x3b84, 0x3a51, 0x3b4b, 0x31d2, 0x3747, 0x3b96, 0x3b40, 0x3535, 0x38d1, 0x3899, 0x3b00, 0x3827, 0x3ae3, 0x38c8, 0x3a07, 0x338d, 0x2e96, 0x3a46, 0x394a, 0x39de, 0x2951, 0x3a02, +0x3838, 0x2d45, 0x28c0, 0x3958, 0x3070, 0x2aa2, 0x3510, 0x38ce, 0x271c, 0x3440, 0x3954, 0x30bc, 0x3b35, 0x2f1d, 0x3afb, 0x2dae, 0x356f, 0x2e13, 0x3981, 0x326d, 0x3a28, 0x3a36, 0x3a95, 0x38cb, 0x38db, 0x3150, 0x2c9e, 0x34c5, 
0x3adb, 0x3bdf, 0x38f2, 0x3994, +0x36f8, 0x31c0, 0x3a4f, 0x3825, 0x394b, 0x3a8b, 0x38ac, 0x3167, 0x2e2d, 0x3a93, 0x34f3, 0x37bd, 0x3b63, 0x2f2f, 0x3ae0, 0x3ad8, 0x34a8, 0x2e1c, 0x3890, 0x3705, 0x3b69, 0x3bc1, 0x28af, 0x3b36, 0x348b, 0x3111, 0x3a8d, 0x389c, 0x3916, 0x36dc, 0x3bae, 0x3874, +0x3593, 0x3638, 0x3018, 0x3a56, 0x38a3, 0x2ad4, 0x3a25, 0x38d7, 0x3864, 0x31c1, 0x28d1, 0x39c8, 0x37d6, 0x2c7f, 0x3ba5, 0x34b8, 0x3bef, 0x3b83, 0x3ab5, 0x3062, 0x38bc, 0x399c, 0x2ce4, 0x2f2c, 0x39bf, 0x2ed1, 0x385f, 0x37e0, 0x35ee, 0x397d, 0x3b0c, 0x3049, +0x39d5, 0x322e, 0x3936, 0x3747, 0x2e15, 0x3b41, 0x3874, 0x3bd0, 0x2c04, 0x3800, 0x375b, 0x3b2d, 0x38d8, 0x3a51, 0x3406, 0x38da, 0x38ba, 0x3497, 0x382e, 0x35fc, 0x39d4, 0x3775, 0x3b1e, 0x3813, 0x3649, 0x31af, 0x37bb, 0x334a, 0x3a6e, 0x3284, 0x26e0, 0x2e01, +0x2ebb, 0x344b, 0x3821, 0x381a, 0x385a, 0x2534, 0x3635, 0x2a92, 0x3b8c, 0x31f0, 0x3947, 0x3ac7, 0x3743, 0x3924, 0x39e4, 0x358f, 0x2b62, 0x392c, 0x3955, 0x3341, 0x3676, 0x38ac, 0x3957, 0x335b, 0x2ca2, 0x39ff, 0x37cb, 0x341f, 0x3ac9, 0x3b6c, 0x2f14, 0x34c3, +0x3018, 0x3169, 0x355b, 0x3624, 0x31ed, 0x379e, 0x3268, 0x309b, 0x35db, 0x3872, 0x3bdb, 0x34c7, 0x3408, 0x3359, 0x3920, 0x331f, 0x3866, 0x3af0, 0x2a1a, 0x39e0, 0x3b14, 0x34fa, 0x2d18, 0x3963, 0x35e8, 0x2539, 0x38f5, 0x37b3, 0x378f, 0x31b5, 0x3a6c, 0x3685, +0x3a06, 0x318a, 0x2934, 0x33c1, 0x3be8, 0x375b, 0x3860, 0x3543, 0x3702, 0x3951, 0x3677, 0x37ff, 0x2e27, 0x2e3a, 0x340f, 0x3817, 0x2f04, 0x357e, 0x3a1d, 0x2dd6, 0x252a, 0x3945, 0x162a, 0x3b19, 0x3a53, 0x35d2, 0x3a5d, 0x3474, 0x38e9, 0x374b, 0x387c, 0x1f1a, +0x38ac, 0x3291, 0x3393, 0x3b53, 0x3169, 0x3bca, 0x2f1a, 0x3551, 0x38a3, 0x28e3, 0x369d, 0x34a1, 0x38a8, 0x34c3, 0x3841, 0x390d, 0x3b13, 0x3282, 0x3a29, 0x3a78, 0x2df3, 0x3a37, 0x35f4, 0x35a6, 0x38e8, 0x3328, 0x3beb, 0x390b, 0x32dc, 0x34dc, 0x396d, 0x3a78, +0x39ba, 0x3a06, 0x2cdd, 0x3bc3, 0x2d43, 0x2992, 0x3663, 0x3a68, 0x2c3e, 0x394e, 0x2c9f, 0x380e, 0x37f5, 0x3557, 0x2873, 0x390f, 0x39e7, 0x3939, 0x3669, 0x385c, 0x3a68, 
0x32c4, 0x2b04, 0x2d6d, 0x39d3, 0x3895, 0x331d, 0x3b59, 0x3463, 0x2b6a, 0x31de, 0x3296, +0x3aae, 0x3bcd, 0x345a, 0x3897, 0x374b, 0x3bd4, 0x38a2, 0x357f, 0x3402, 0x3a0c, 0x3507, 0x3865, 0x3a54, 0x3878, 0x3859, 0x383e, 0x32b5, 0x34ea, 0x328d, 0x38b6, 0x3464, 0x2f5b, 0x35ff, 0x3817, 0x2f24, 0x3533, 0x3b21, 0x37ba, 0x3837, 0x2e34, 0x3bad, 0x34bc +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/y_2D.h b/hwpe/redmule_256iter/inc/y_2D.h new file mode 100644 index 0000000..9484a10 --- /dev/null +++ b/hwpe/redmule_256iter/inc/y_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t y_inp_2D [32][32] = { +0x3150, 0x2dc1, 0x3033, 0x31f5, 0x3bb6, 0x3bff, 0x39f9, 0x3662, 0x3720, 0x351d, 0x384b, 0x3093, 0x3b9d, 0x35ad, 0x3695, 0x3466, 0x2300, 0x3445, 0x33ae, 0x3586, 0x38a3, 0x3bdb, 0x33a2, 0x379b, 0x3a0e, 0x38b0, 0x39ba, 0x379b, 0x39d3, 0x3a51, 0x3b30, 0x3794, +0x3b76, 0x3042, 0x38cc, 0x2dfc, 0x3b1a, 0x37fb, 0x38f7, 0x3824, 0x386f, 0x38c7, 0x36ee, 0x3a9c, 0x38d3, 0x2c67, 0x3a80, 0x2f30, 0x3328, 0x3721, 0x3790, 0x34e5, 0x3a6c, 0x3643, 0x3934, 0x3034, 0x38d4, 0x362e, 0x3b4b, 0x3408, 0x30c2, 0x370e, 0x3b31, 0x3b16, +0x3b6b, 0x39d4, 0x339c, 0x381e, 0x313e, 0x3671, 0x3ae2, 0x3479, 0x3940, 0x342d, 0x3925, 0x370a, 0x35d8, 0x2dad, 0x3888, 0x24b9, 0x375d, 0x34bd, 0x3243, 0x2ebb, 0x3970, 0x3a21, 0x3a07, 0x3877, 0x3888, 0x3569, 0x372d, 0x2ac1, 0x331e, 0x384d, 0x3996, 0x34a4, +0x35c1, 0x33a9, 0x21ed, 0x3a42, 0x388d, 0x34e4, 0x33c3, 0x34f9, 0x3a7b, 0x33fb, 0x2cdd, 0x3b0e, 0x333b, 0x3973, 0x34fc, 0x3771, 0x32ea, 0x2de4, 0x31a8, 0x3946, 0x3657, 0x3a4e, 0x36f6, 0x2829, 0x3ba2, 0x3bdc, 0x3bb3, 0x306c, 0x398d, 0x3a1f, 0x3991, 0x3846, +0x3547, 0x3292, 0x2e85, 0x31ed, 0x3979, 0x3a90, 0x28a4, 0x3bed, 0x36d8, 0x340e, 0x3b6a, 0x3ab6, 0x3824, 0x382b, 0x3ac3, 0x3811, 0x36d7, 0x3519, 0x3a92, 0x3a42, 0x29d1, 0x383a, 0x3a9b, 0x300e, 0x2cd3, 0x39cd, 0x3874, 0x3a07, 0x2eb1, 0x3b86, 0x3ad8, 0x3a5d, +0x3712, 0x284a, 0x38c1, 0x3bec, 0x39c0, 0x32cd, 0x3ad8, 
0x3bce, 0x3817, 0x3896, 0x3aa7, 0x3870, 0x3996, 0x32cc, 0x3a4c, 0x3757, 0x3814, 0x3b65, 0x3acb, 0x376e, 0x34c0, 0x3609, 0x3bf0, 0x3b24, 0x3b29, 0x3848, 0x34b7, 0x398a, 0x220c, 0x3498, 0x3a8c, 0x3883, +0x38c4, 0x3af6, 0x3a42, 0x2dd6, 0x3147, 0x3717, 0x3a8e, 0x3af9, 0x3296, 0x38ef, 0x34fa, 0x3555, 0x3b29, 0x38de, 0x315e, 0x3773, 0x3b67, 0x3116, 0x38ec, 0x357c, 0x35d0, 0x2518, 0x3958, 0x2a03, 0x37d9, 0x3699, 0x3a1e, 0x3230, 0x3b13, 0x36d4, 0x3b2a, 0x39ad, +0x3b10, 0x351a, 0x3b97, 0x3326, 0x2b54, 0x3b7d, 0x386f, 0x373e, 0x37fa, 0x389b, 0x3b90, 0x3292, 0x3975, 0x38f3, 0x37f1, 0x3590, 0x3810, 0x2fd7, 0x3bf7, 0x3a5a, 0x3a1c, 0x34dd, 0x354c, 0x32f8, 0x3095, 0x321e, 0x39e0, 0x395c, 0x3717, 0x357f, 0x394a, 0x34b1, +0x3ba4, 0x380c, 0x3604, 0x2f50, 0x348d, 0x3828, 0x3a9f, 0x39ce, 0x32ca, 0x3906, 0x3ab2, 0x2ca5, 0x38c9, 0x362a, 0x34b2, 0x29dc, 0x3a36, 0x3052, 0x31b7, 0x3589, 0x387c, 0x3401, 0x3b22, 0x3ad6, 0x3ae8, 0x3238, 0x3494, 0x3502, 0x3717, 0x3a6c, 0x3229, 0x368c, +0x3056, 0x3a56, 0x3498, 0x39eb, 0x2864, 0x342d, 0x39e0, 0x34a1, 0x2b99, 0x3a04, 0x38ff, 0x328c, 0x34d9, 0x387d, 0x3a3c, 0x32e5, 0x39eb, 0x3984, 0x34dd, 0x38a7, 0x373f, 0x39b4, 0x3235, 0x2f58, 0x2f39, 0x3800, 0x3758, 0x3939, 0x39fc, 0x3a4b, 0x38bf, 0x30ee, +0x345e, 0x39c8, 0x3a6d, 0x3262, 0x3b81, 0x31dc, 0x3a15, 0x3bd0, 0x36af, 0x36de, 0x37d5, 0x39d7, 0x3ad3, 0x3ac1, 0x3109, 0x35ea, 0x31c6, 0x398d, 0x3987, 0x3a4a, 0x34d2, 0x2ed2, 0x35e6, 0x352c, 0x39eb, 0x3bd6, 0x3a5b, 0x39d1, 0x34aa, 0x3ade, 0x394b, 0x38a1, +0x2bed, 0x38de, 0x3811, 0x3813, 0x391a, 0x374b, 0x3829, 0x3725, 0x38f0, 0x3583, 0x3966, 0x3a7d, 0x375a, 0x38fe, 0x3696, 0x361c, 0x39a8, 0x35f0, 0x38e1, 0x3003, 0x3595, 0x316e, 0x3862, 0x3af8, 0x3af2, 0x34c8, 0x381d, 0x37d8, 0x3893, 0x3a9c, 0x3989, 0x308c, +0x30cc, 0x2538, 0x399d, 0x3919, 0x399e, 0x21cc, 0x38e9, 0x30f8, 0x3a20, 0x3b3c, 0x3990, 0x259c, 0x3143, 0x3080, 0x3967, 0x3afb, 0x3a1b, 0x3779, 0x2eeb, 0x39f3, 0x379a, 0x369c, 0x3985, 0x3a1b, 0x3ba6, 0x3a53, 0x28d5, 0x3881, 0x31d9, 0x3a34, 0x3bd9, 0x393a, 
+0x3601, 0x2c6e, 0x3636, 0x3298, 0x39bb, 0x3a08, 0x38db, 0x35ad, 0x3a09, 0x36a6, 0x3bc7, 0x3bac, 0x34ae, 0x3291, 0x290b, 0x3250, 0x2648, 0x333d, 0x2bf3, 0x34b1, 0x30e0, 0x351f, 0x3a74, 0x38dc, 0x3883, 0x2841, 0x35e1, 0x390d, 0x3a50, 0x3abd, 0x386d, 0x3bb7, +0x3b94, 0x36b7, 0x3a49, 0x332f, 0x3a1d, 0x354b, 0x3bab, 0x3346, 0x3417, 0x351e, 0x3b6d, 0x391a, 0x2db3, 0x3b1c, 0x3a4a, 0x37b7, 0x36cf, 0x3a56, 0x39c4, 0x3be9, 0x34f0, 0x39be, 0x3691, 0x1ba5, 0x3888, 0x3040, 0x3ae1, 0x3b9b, 0x398f, 0x3a49, 0x3a16, 0x38c0, +0x386c, 0x39ab, 0x37fa, 0x382c, 0x3a6f, 0x393f, 0x340d, 0x38ef, 0x39d1, 0x3845, 0x398f, 0x363e, 0x3687, 0x3052, 0x3a2b, 0x392c, 0x2f5c, 0x3412, 0x3a1f, 0x3b2f, 0x3bcc, 0x3a63, 0x3a89, 0x36e9, 0x3921, 0x3b80, 0x2dc0, 0x3a03, 0x3beb, 0x38d3, 0x36cb, 0x39a3, +0x3978, 0x3a88, 0x3ba4, 0x3561, 0x28c5, 0x33a0, 0x37be, 0x2c39, 0x30ee, 0x3782, 0x2c07, 0x354e, 0x3491, 0x3a92, 0x331a, 0x3b15, 0x32e1, 0x3839, 0x3afb, 0x36c2, 0x2fd0, 0x29ad, 0x3b2e, 0x39c1, 0x2a8c, 0x341a, 0x2f90, 0x395a, 0x3969, 0x37ea, 0x3a5c, 0x3b6d, +0x3971, 0x3a93, 0x304e, 0x3623, 0x3a22, 0x31ee, 0x29df, 0x2c93, 0x3a01, 0x3a62, 0x366c, 0x371d, 0x3af3, 0x2e08, 0x3ac0, 0x3642, 0x3a28, 0x368d, 0x2d3d, 0x36d9, 0x32c3, 0x373f, 0x36fe, 0x3487, 0x2c81, 0x3623, 0x3b59, 0x3a91, 0x350a, 0x34f4, 0x3b09, 0x2c25, +0x3b13, 0x325a, 0x379e, 0x3a7d, 0x34b1, 0x39d5, 0x2ba8, 0x322b, 0x3b5e, 0x37ab, 0x2e24, 0x3ba9, 0x3a3d, 0x34f7, 0x3ba1, 0x3877, 0x3071, 0x39fb, 0x3bbd, 0x3633, 0x3b36, 0x2daa, 0x3b9b, 0x3aa0, 0x395c, 0x3b8f, 0x38d5, 0x3ab0, 0x3a8f, 0x36c2, 0x3b1f, 0x3489, +0x2acc, 0x3845, 0x3715, 0x37d8, 0x3992, 0x3bff, 0x350e, 0x3ad7, 0x39b0, 0x35ac, 0x3287, 0x385f, 0x3bd4, 0x37a3, 0x3438, 0x39a5, 0x3bcf, 0x38c3, 0x34f6, 0x3ae3, 0x3b57, 0x39af, 0x35eb, 0x3bed, 0x34d4, 0x2a95, 0x3b13, 0x384e, 0x3a3b, 0x33da, 0x3bce, 0x3b99, +0x3559, 0x3335, 0x3a2e, 0x3123, 0x38db, 0x33d0, 0x3638, 0x3b17, 0x3a72, 0x3afc, 0x3936, 0x3838, 0x2b69, 0x3895, 0x3a1a, 0x3192, 0x39d5, 0x37a5, 0x2eb0, 0x2e8b, 0x329a, 0x3b90, 0x390a, 0x3a1e, 0x3847, 
0x375d, 0x3873, 0x35e2, 0x3771, 0x30f5, 0x3231, 0x3bd7, +0x2bbc, 0x3ace, 0x31ad, 0x3a6b, 0x28a4, 0x3b48, 0x3ba3, 0x3a84, 0x3353, 0x39f6, 0x381f, 0x2dd6, 0x314c, 0x34af, 0x3929, 0x3921, 0x383b, 0x34b0, 0x3923, 0x32c9, 0x3ae7, 0x318f, 0x3480, 0x2ad8, 0x3042, 0x3a4c, 0x349d, 0x2c12, 0x3abb, 0x3a57, 0x3b0d, 0x3111, +0x3359, 0x3a84, 0x38f2, 0x368d, 0x2f4b, 0x3ba0, 0x395c, 0x3026, 0x3a15, 0x2a04, 0x326e, 0x3522, 0x31a2, 0x382f, 0x2ada, 0x3b7c, 0x2f80, 0x3af5, 0x2d35, 0x38fa, 0x39ab, 0x2c6d, 0x2e7a, 0x39f6, 0x31a4, 0x3a53, 0x358c, 0x3951, 0x3a4e, 0x3916, 0x2a3f, 0x3ae9, +0x3b03, 0x39f8, 0x39fe, 0x3a61, 0x39fb, 0x3704, 0x360d, 0x39a7, 0x37a9, 0x348f, 0x3a30, 0x3af5, 0x366f, 0x3b29, 0x3a6a, 0x33d5, 0x370a, 0x39cd, 0x3444, 0x3bea, 0x3b2b, 0x312e, 0x3b8e, 0x32cf, 0x3b79, 0x3302, 0x3bba, 0x3962, 0x3413, 0x37a1, 0x39e0, 0x3805 +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/y_input.h b/hwpe/redmule_256iter/inc/y_input.h new file mode 100644 index 0000000..45a2375 --- /dev/null +++ b/hwpe/redmule_256iter/inc/y_input.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t y_inp [768] = { +0x3150, 0x2dc1, 0x3033, 0x31f5, 0x3bb6, 0x3bff, 0x39f9, 0x3662, 0x3720, 0x351d, 0x384b, 0x3093, 0x3b9d, 0x35ad, 0x3695, 0x3466, 0x2300, 0x3445, 0x33ae, 0x3586, 0x38a3, 0x3bdb, 0x33a2, 0x379b, 0x3a0e, 0x38b0, 0x39ba, 0x379b, 0x39d3, 0x3a51, 0x3b30, 0x3794, +0x3b76, 0x3042, 0x38cc, 0x2dfc, 0x3b1a, 0x37fb, 0x38f7, 0x3824, 0x386f, 0x38c7, 0x36ee, 0x3a9c, 0x38d3, 0x2c67, 0x3a80, 0x2f30, 0x3328, 0x3721, 0x3790, 0x34e5, 0x3a6c, 0x3643, 0x3934, 0x3034, 0x38d4, 0x362e, 0x3b4b, 0x3408, 0x30c2, 0x370e, 0x3b31, 0x3b16, +0x3b6b, 0x39d4, 0x339c, 0x381e, 0x313e, 0x3671, 0x3ae2, 0x3479, 0x3940, 0x342d, 0x3925, 0x370a, 0x35d8, 0x2dad, 0x3888, 0x24b9, 0x375d, 0x34bd, 0x3243, 0x2ebb, 0x3970, 0x3a21, 0x3a07, 0x3877, 0x3888, 0x3569, 0x372d, 0x2ac1, 0x331e, 0x384d, 0x3996, 0x34a4, +0x35c1, 0x33a9, 0x21ed, 0x3a42, 0x388d, 0x34e4, 0x33c3, 0x34f9, 0x3a7b, 0x33fb, 0x2cdd, 
0x3b0e, 0x333b, 0x3973, 0x34fc, 0x3771, 0x32ea, 0x2de4, 0x31a8, 0x3946, 0x3657, 0x3a4e, 0x36f6, 0x2829, 0x3ba2, 0x3bdc, 0x3bb3, 0x306c, 0x398d, 0x3a1f, 0x3991, 0x3846, +0x3547, 0x3292, 0x2e85, 0x31ed, 0x3979, 0x3a90, 0x28a4, 0x3bed, 0x36d8, 0x340e, 0x3b6a, 0x3ab6, 0x3824, 0x382b, 0x3ac3, 0x3811, 0x36d7, 0x3519, 0x3a92, 0x3a42, 0x29d1, 0x383a, 0x3a9b, 0x300e, 0x2cd3, 0x39cd, 0x3874, 0x3a07, 0x2eb1, 0x3b86, 0x3ad8, 0x3a5d, +0x3712, 0x284a, 0x38c1, 0x3bec, 0x39c0, 0x32cd, 0x3ad8, 0x3bce, 0x3817, 0x3896, 0x3aa7, 0x3870, 0x3996, 0x32cc, 0x3a4c, 0x3757, 0x3814, 0x3b65, 0x3acb, 0x376e, 0x34c0, 0x3609, 0x3bf0, 0x3b24, 0x3b29, 0x3848, 0x34b7, 0x398a, 0x220c, 0x3498, 0x3a8c, 0x3883, +0x38c4, 0x3af6, 0x3a42, 0x2dd6, 0x3147, 0x3717, 0x3a8e, 0x3af9, 0x3296, 0x38ef, 0x34fa, 0x3555, 0x3b29, 0x38de, 0x315e, 0x3773, 0x3b67, 0x3116, 0x38ec, 0x357c, 0x35d0, 0x2518, 0x3958, 0x2a03, 0x37d9, 0x3699, 0x3a1e, 0x3230, 0x3b13, 0x36d4, 0x3b2a, 0x39ad, +0x3b10, 0x351a, 0x3b97, 0x3326, 0x2b54, 0x3b7d, 0x386f, 0x373e, 0x37fa, 0x389b, 0x3b90, 0x3292, 0x3975, 0x38f3, 0x37f1, 0x3590, 0x3810, 0x2fd7, 0x3bf7, 0x3a5a, 0x3a1c, 0x34dd, 0x354c, 0x32f8, 0x3095, 0x321e, 0x39e0, 0x395c, 0x3717, 0x357f, 0x394a, 0x34b1, +0x3ba4, 0x380c, 0x3604, 0x2f50, 0x348d, 0x3828, 0x3a9f, 0x39ce, 0x32ca, 0x3906, 0x3ab2, 0x2ca5, 0x38c9, 0x362a, 0x34b2, 0x29dc, 0x3a36, 0x3052, 0x31b7, 0x3589, 0x387c, 0x3401, 0x3b22, 0x3ad6, 0x3ae8, 0x3238, 0x3494, 0x3502, 0x3717, 0x3a6c, 0x3229, 0x368c, +0x3056, 0x3a56, 0x3498, 0x39eb, 0x2864, 0x342d, 0x39e0, 0x34a1, 0x2b99, 0x3a04, 0x38ff, 0x328c, 0x34d9, 0x387d, 0x3a3c, 0x32e5, 0x39eb, 0x3984, 0x34dd, 0x38a7, 0x373f, 0x39b4, 0x3235, 0x2f58, 0x2f39, 0x3800, 0x3758, 0x3939, 0x39fc, 0x3a4b, 0x38bf, 0x30ee, +0x345e, 0x39c8, 0x3a6d, 0x3262, 0x3b81, 0x31dc, 0x3a15, 0x3bd0, 0x36af, 0x36de, 0x37d5, 0x39d7, 0x3ad3, 0x3ac1, 0x3109, 0x35ea, 0x31c6, 0x398d, 0x3987, 0x3a4a, 0x34d2, 0x2ed2, 0x35e6, 0x352c, 0x39eb, 0x3bd6, 0x3a5b, 0x39d1, 0x34aa, 0x3ade, 0x394b, 0x38a1, +0x2bed, 0x38de, 0x3811, 0x3813, 
0x391a, 0x374b, 0x3829, 0x3725, 0x38f0, 0x3583, 0x3966, 0x3a7d, 0x375a, 0x38fe, 0x3696, 0x361c, 0x39a8, 0x35f0, 0x38e1, 0x3003, 0x3595, 0x316e, 0x3862, 0x3af8, 0x3af2, 0x34c8, 0x381d, 0x37d8, 0x3893, 0x3a9c, 0x3989, 0x308c, +0x30cc, 0x2538, 0x399d, 0x3919, 0x399e, 0x21cc, 0x38e9, 0x30f8, 0x3a20, 0x3b3c, 0x3990, 0x259c, 0x3143, 0x3080, 0x3967, 0x3afb, 0x3a1b, 0x3779, 0x2eeb, 0x39f3, 0x379a, 0x369c, 0x3985, 0x3a1b, 0x3ba6, 0x3a53, 0x28d5, 0x3881, 0x31d9, 0x3a34, 0x3bd9, 0x393a, +0x3601, 0x2c6e, 0x3636, 0x3298, 0x39bb, 0x3a08, 0x38db, 0x35ad, 0x3a09, 0x36a6, 0x3bc7, 0x3bac, 0x34ae, 0x3291, 0x290b, 0x3250, 0x2648, 0x333d, 0x2bf3, 0x34b1, 0x30e0, 0x351f, 0x3a74, 0x38dc, 0x3883, 0x2841, 0x35e1, 0x390d, 0x3a50, 0x3abd, 0x386d, 0x3bb7, +0x3b94, 0x36b7, 0x3a49, 0x332f, 0x3a1d, 0x354b, 0x3bab, 0x3346, 0x3417, 0x351e, 0x3b6d, 0x391a, 0x2db3, 0x3b1c, 0x3a4a, 0x37b7, 0x36cf, 0x3a56, 0x39c4, 0x3be9, 0x34f0, 0x39be, 0x3691, 0x1ba5, 0x3888, 0x3040, 0x3ae1, 0x3b9b, 0x398f, 0x3a49, 0x3a16, 0x38c0, +0x386c, 0x39ab, 0x37fa, 0x382c, 0x3a6f, 0x393f, 0x340d, 0x38ef, 0x39d1, 0x3845, 0x398f, 0x363e, 0x3687, 0x3052, 0x3a2b, 0x392c, 0x2f5c, 0x3412, 0x3a1f, 0x3b2f, 0x3bcc, 0x3a63, 0x3a89, 0x36e9, 0x3921, 0x3b80, 0x2dc0, 0x3a03, 0x3beb, 0x38d3, 0x36cb, 0x39a3, +0x3978, 0x3a88, 0x3ba4, 0x3561, 0x28c5, 0x33a0, 0x37be, 0x2c39, 0x30ee, 0x3782, 0x2c07, 0x354e, 0x3491, 0x3a92, 0x331a, 0x3b15, 0x32e1, 0x3839, 0x3afb, 0x36c2, 0x2fd0, 0x29ad, 0x3b2e, 0x39c1, 0x2a8c, 0x341a, 0x2f90, 0x395a, 0x3969, 0x37ea, 0x3a5c, 0x3b6d, +0x3971, 0x3a93, 0x304e, 0x3623, 0x3a22, 0x31ee, 0x29df, 0x2c93, 0x3a01, 0x3a62, 0x366c, 0x371d, 0x3af3, 0x2e08, 0x3ac0, 0x3642, 0x3a28, 0x368d, 0x2d3d, 0x36d9, 0x32c3, 0x373f, 0x36fe, 0x3487, 0x2c81, 0x3623, 0x3b59, 0x3a91, 0x350a, 0x34f4, 0x3b09, 0x2c25, +0x3b13, 0x325a, 0x379e, 0x3a7d, 0x34b1, 0x39d5, 0x2ba8, 0x322b, 0x3b5e, 0x37ab, 0x2e24, 0x3ba9, 0x3a3d, 0x34f7, 0x3ba1, 0x3877, 0x3071, 0x39fb, 0x3bbd, 0x3633, 0x3b36, 0x2daa, 0x3b9b, 0x3aa0, 0x395c, 0x3b8f, 0x38d5, 0x3ab0, 0x3a8f, 
0x36c2, 0x3b1f, 0x3489, +0x2acc, 0x3845, 0x3715, 0x37d8, 0x3992, 0x3bff, 0x350e, 0x3ad7, 0x39b0, 0x35ac, 0x3287, 0x385f, 0x3bd4, 0x37a3, 0x3438, 0x39a5, 0x3bcf, 0x38c3, 0x34f6, 0x3ae3, 0x3b57, 0x39af, 0x35eb, 0x3bed, 0x34d4, 0x2a95, 0x3b13, 0x384e, 0x3a3b, 0x33da, 0x3bce, 0x3b99, +0x3559, 0x3335, 0x3a2e, 0x3123, 0x38db, 0x33d0, 0x3638, 0x3b17, 0x3a72, 0x3afc, 0x3936, 0x3838, 0x2b69, 0x3895, 0x3a1a, 0x3192, 0x39d5, 0x37a5, 0x2eb0, 0x2e8b, 0x329a, 0x3b90, 0x390a, 0x3a1e, 0x3847, 0x375d, 0x3873, 0x35e2, 0x3771, 0x30f5, 0x3231, 0x3bd7, +0x2bbc, 0x3ace, 0x31ad, 0x3a6b, 0x28a4, 0x3b48, 0x3ba3, 0x3a84, 0x3353, 0x39f6, 0x381f, 0x2dd6, 0x314c, 0x34af, 0x3929, 0x3921, 0x383b, 0x34b0, 0x3923, 0x32c9, 0x3ae7, 0x318f, 0x3480, 0x2ad8, 0x3042, 0x3a4c, 0x349d, 0x2c12, 0x3abb, 0x3a57, 0x3b0d, 0x3111, +0x3359, 0x3a84, 0x38f2, 0x368d, 0x2f4b, 0x3ba0, 0x395c, 0x3026, 0x3a15, 0x2a04, 0x326e, 0x3522, 0x31a2, 0x382f, 0x2ada, 0x3b7c, 0x2f80, 0x3af5, 0x2d35, 0x38fa, 0x39ab, 0x2c6d, 0x2e7a, 0x39f6, 0x31a4, 0x3a53, 0x358c, 0x3951, 0x3a4e, 0x3916, 0x2a3f, 0x3ae9, +0x3b03, 0x39f8, 0x39fe, 0x3a61, 0x39fb, 0x3704, 0x360d, 0x39a7, 0x37a9, 0x348f, 0x3a30, 0x3af5, 0x366f, 0x3b29, 0x3a6a, 0x33d5, 0x370a, 0x39cd, 0x3444, 0x3bea, 0x3b2b, 0x312e, 0x3b8e, 0x32cf, 0x3b79, 0x3302, 0x3bba, 0x3962, 0x3413, 0x37a1, 0x39e0, 0x3805 +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/z_2D.h b/hwpe/redmule_256iter/inc/z_2D.h new file mode 100644 index 0000000..aff808a --- /dev/null +++ b/hwpe/redmule_256iter/inc/z_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t z_oup_2D [24][32] = { +0x4845, 0x4897, 0x4608, 0x4838, 0x4855, 0x487b, 0x4869, 0x4880, 0x46d1, 0x48b0, 0x48db, 0x483f, 0x48c9, 0x485f, 0x4881, 0x483a, 0x484b, 0x472c, 0x4762, 0x492b, 0x4822, 0x48fd, 0x488e, 0x492e, 0x483e, 0x484f, 0x49e8, 0x46d7, 0x484b, 0x489d, 0x490b, 0x47e9, +0x484f, 0x47d2, 0x44be, 0x4747, 0x47c7, 0x46c0, 0x4727, 0x48af, 0x46c5, 0x482d, 0x483d, 0x482e, 0x4897, 0x479f, 0x488b, 
0x4749, 0x489a, 0x46a8, 0x46f2, 0x488b, 0x4891, 0x47e8, 0x4872, 0x483d, 0x4716, 0x46fd, 0x49b5, 0x46a0, 0x46e7, 0x47a4, 0x48a1, 0x4767, +0x4939, 0x4935, 0x4703, 0x48c1, 0x4863, 0x48bd, 0x4913, 0x48cf, 0x48b6, 0x48b8, 0x4946, 0x4920, 0x495e, 0x48e1, 0x4938, 0x48b2, 0x493a, 0x4882, 0x483b, 0x49d5, 0x4911, 0x4972, 0x496b, 0x49df, 0x48f2, 0x4888, 0x4a46, 0x4821, 0x48c1, 0x490c, 0x49b2, 0x48a3, +0x463a, 0x47b0, 0x44cb, 0x4762, 0x4765, 0x46b9, 0x466a, 0x4814, 0x4631, 0x4796, 0x4666, 0x474b, 0x4798, 0x4704, 0x4838, 0x4761, 0x47d3, 0x4590, 0x45ea, 0x48a2, 0x47f1, 0x4844, 0x484b, 0x4776, 0x47d6, 0x46d8, 0x48f3, 0x44d3, 0x46fa, 0x478d, 0x481e, 0x466e, +0x4827, 0x481e, 0x45a2, 0x4794, 0x4727, 0x4806, 0x475d, 0x48d5, 0x4708, 0x4828, 0x4862, 0x480d, 0x4895, 0x4832, 0x48bd, 0x47f1, 0x482a, 0x46a7, 0x47b1, 0x492d, 0x484d, 0x4884, 0x48dc, 0x485f, 0x476d, 0x480c, 0x48e9, 0x46d3, 0x4728, 0x4884, 0x48a0, 0x480e, +0x4862, 0x4813, 0x4675, 0x485a, 0x47e8, 0x4738, 0x4836, 0x4823, 0x46e7, 0x4821, 0x4822, 0x47b3, 0x4846, 0x4855, 0x4863, 0x4717, 0x4872, 0x47c1, 0x46d5, 0x488e, 0x47e2, 0x485f, 0x487c, 0x48b8, 0x481e, 0x4788, 0x48bd, 0x4677, 0x46c9, 0x47f8, 0x48fe, 0x47fc, +0x47a0, 0x47b2, 0x4588, 0x467e, 0x4662, 0x46c7, 0x46e8, 0x4812, 0x4536, 0x474e, 0x46c0, 0x468f, 0x481f, 0x4679, 0x46a1, 0x46e2, 0x4809, 0x4560, 0x4630, 0x47eb, 0x46b5, 0x4757, 0x4848, 0x477f, 0x46a6, 0x46d8, 0x4870, 0x459a, 0x4670, 0x4678, 0x47d2, 0x468c, +0x4762, 0x48c4, 0x46e3, 0x4791, 0x46b1, 0x486d, 0x47d0, 0x4867, 0x468d, 0x47f6, 0x48a5, 0x4756, 0x4857, 0x4854, 0x4866, 0x4838, 0x484d, 0x46ec, 0x47d2, 0x48f6, 0x484a, 0x4879, 0x4848, 0x483c, 0x471d, 0x4806, 0x48fa, 0x4730, 0x4768, 0x47b8, 0x4865, 0x46f9, +0x48a8, 0x4918, 0x46ca, 0x4867, 0x4800, 0x4862, 0x48d3, 0x4910, 0x474e, 0x4849, 0x48eb, 0x486b, 0x4966, 0x48c5, 0x48f4, 0x4830, 0x48f9, 0x4778, 0x481e, 0x499e, 0x48cf, 0x48f1, 0x4982, 0x4923, 0x487c, 0x47cf, 0x49ea, 0x4649, 0x4773, 0x495e, 0x48b2, 0x483f, +0x48a7, 0x4975, 0x4616, 0x481e, 0x481f, 0x4866, 0x48b6, 0x4864, 
0x47dc, 0x4873, 0x485c, 0x487f, 0x4938, 0x491f, 0x490d, 0x48b6, 0x48f8, 0x48a1, 0x4859, 0x492d, 0x489c, 0x4915, 0x4899, 0x4887, 0x486c, 0x4859, 0x49ca, 0x471e, 0x4867, 0x4918, 0x48d3, 0x4827, +0x488b, 0x4998, 0x4704, 0x481d, 0x48b8, 0x4880, 0x4876, 0x4944, 0x470c, 0x48f2, 0x48b9, 0x489b, 0x4956, 0x48e5, 0x48d6, 0x48a5, 0x48dc, 0x4856, 0x484e, 0x49ab, 0x48e0, 0x490e, 0x48dd, 0x4945, 0x488b, 0x48dd, 0x4a32, 0x47ea, 0x4835, 0x4911, 0x4965, 0x4819, +0x460e, 0x481e, 0x452c, 0x4673, 0x475c, 0x4717, 0x46f6, 0x46d0, 0x4696, 0x46bc, 0x4726, 0x481e, 0x4763, 0x46ea, 0x46fe, 0x4758, 0x478b, 0x4627, 0x4704, 0x483f, 0x46ad, 0x47b1, 0x4792, 0x4816, 0x46f2, 0x4684, 0x4827, 0x45a8, 0x472f, 0x47a4, 0x4797, 0x462b, +0x483f, 0x48ab, 0x468f, 0x4863, 0x485a, 0x4766, 0x481d, 0x48cb, 0x47dc, 0x4903, 0x48fc, 0x4830, 0x48cc, 0x483e, 0x48ab, 0x4864, 0x4966, 0x4763, 0x4794, 0x499d, 0x488e, 0x488b, 0x48dc, 0x4960, 0x4854, 0x484c, 0x499c, 0x474c, 0x4826, 0x48bc, 0x4949, 0x4883, +0x489d, 0x4905, 0x4718, 0x481e, 0x48e3, 0x48f4, 0x48c1, 0x4904, 0x47e8, 0x48b3, 0x4892, 0x48d4, 0x48ff, 0x4894, 0x48d5, 0x4886, 0x48fa, 0x4803, 0x47d2, 0x492e, 0x4870, 0x48b2, 0x48e5, 0x492b, 0x487b, 0x4785, 0x49e3, 0x471d, 0x4837, 0x48bf, 0x489b, 0x48c4, +0x475c, 0x4871, 0x464a, 0x4811, 0x47af, 0x471c, 0x4817, 0x4817, 0x463b, 0x484e, 0x477f, 0x464f, 0x4704, 0x487c, 0x47a3, 0x4725, 0x4853, 0x462a, 0x465a, 0x4860, 0x4736, 0x4880, 0x47e1, 0x482b, 0x4811, 0x46c0, 0x48dc, 0x475d, 0x4668, 0x4806, 0x4893, 0x46f4, +0x4858, 0x4959, 0x463d, 0x487b, 0x480f, 0x484e, 0x48c0, 0x48a6, 0x4847, 0x4894, 0x48a0, 0x484a, 0x491e, 0x48f4, 0x48fc, 0x48b5, 0x48ce, 0x47d2, 0x47db, 0x497f, 0x4955, 0x4939, 0x48a7, 0x48ce, 0x4890, 0x4884, 0x49d6, 0x4763, 0x486e, 0x4922, 0x48f4, 0x48c3, +0x47ec, 0x491c, 0x4698, 0x4783, 0x4715, 0x4754, 0x4745, 0x4752, 0x472f, 0x4832, 0x4817, 0x4809, 0x47f8, 0x48c3, 0x47e6, 0x4800, 0x48b6, 0x4730, 0x480a, 0x48cb, 0x479e, 0x488e, 0x47c2, 0x488e, 0x472f, 0x47ee, 0x489d, 0x4744, 0x4755, 0x4851, 0x4846, 0x47d3, +0x4838, 
0x48a0, 0x4634, 0x4762, 0x4786, 0x4806, 0x47e3, 0x482d, 0x4726, 0x486c, 0x47b7, 0x4803, 0x48ac, 0x4814, 0x48e0, 0x4839, 0x4827, 0x4750, 0x46f2, 0x48c5, 0x483f, 0x4886, 0x48ad, 0x4856, 0x47e8, 0x47a9, 0x4937, 0x4743, 0x46d0, 0x481f, 0x484c, 0x4804, +0x47fd, 0x481f, 0x456d, 0x4813, 0x474d, 0x4807, 0x4688, 0x480e, 0x46e8, 0x4810, 0x469f, 0x4799, 0x4853, 0x478f, 0x47f2, 0x4824, 0x47d0, 0x471f, 0x46da, 0x485f, 0x4813, 0x481c, 0x482e, 0x4863, 0x4786, 0x480b, 0x48c9, 0x46b8, 0x475a, 0x46e2, 0x4852, 0x46c5, +0x45af, 0x4802, 0x4466, 0x46c2, 0x465d, 0x4743, 0x46b7, 0x47ba, 0x4636, 0x46c3, 0x4677, 0x4784, 0x485a, 0x47c2, 0x46dc, 0x46ac, 0x47de, 0x460e, 0x465f, 0x4834, 0x47f4, 0x4769, 0x46fc, 0x4810, 0x45fd, 0x45ea, 0x48d0, 0x45b5, 0x4704, 0x4783, 0x4830, 0x46c4, +0x4759, 0x47c7, 0x453d, 0x45b0, 0x4741, 0x4702, 0x4736, 0x4793, 0x461b, 0x47ba, 0x470b, 0x46dd, 0x4657, 0x470b, 0x470d, 0x4710, 0x486c, 0x468f, 0x45c3, 0x46ba, 0x479d, 0x483b, 0x46c9, 0x4774, 0x46a9, 0x46a7, 0x4833, 0x4606, 0x4690, 0x46a9, 0x46f5, 0x46a7, +0x47ac, 0x48bb, 0x452c, 0x4803, 0x470f, 0x4824, 0x47d5, 0x48cb, 0x4707, 0x484a, 0x4832, 0x4797, 0x4851, 0x482c, 0x487a, 0x4877, 0x4891, 0x465d, 0x47f4, 0x48ce, 0x4898, 0x4899, 0x484e, 0x486a, 0x47ac, 0x47f0, 0x493e, 0x4611, 0x47e2, 0x489e, 0x488c, 0x46af, +0x4665, 0x4836, 0x45e4, 0x46b6, 0x46a1, 0x46b9, 0x46c8, 0x46dd, 0x4658, 0x474b, 0x467b, 0x4777, 0x4769, 0x4798, 0x4785, 0x475e, 0x472a, 0x4656, 0x45fb, 0x4881, 0x46fc, 0x472d, 0x476e, 0x47a3, 0x465d, 0x46ca, 0x4855, 0x4500, 0x464f, 0x479a, 0x46c3, 0x4738, +0x481e, 0x486c, 0x4659, 0x4801, 0x4756, 0x477a, 0x47d5, 0x487b, 0x4706, 0x4808, 0x484f, 0x4838, 0x4870, 0x4863, 0x48d3, 0x4806, 0x4865, 0x4771, 0x46be, 0x494c, 0x4915, 0x484c, 0x4900, 0x4862, 0x481a, 0x46e8, 0x4974, 0x46a0, 0x4775, 0x483d, 0x487c, 0x480e +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/inc/z_output.h b/hwpe/redmule_256iter/inc/z_output.h new file mode 100644 index 0000000..96c7e5f --- /dev/null +++ 
b/hwpe/redmule_256iter/inc/z_output.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t z_oup [768] = { +0x4845, 0x4897, 0x4608, 0x4838, 0x4855, 0x487b, 0x4869, 0x4880, 0x46d1, 0x48b0, 0x48db, 0x483f, 0x48c9, 0x485f, 0x4881, 0x483a, 0x484b, 0x472c, 0x4762, 0x492b, 0x4822, 0x48fd, 0x488e, 0x492e, 0x483e, 0x484f, 0x49e8, 0x46d7, 0x484b, 0x489d, 0x490b, 0x47e9, +0x484f, 0x47d2, 0x44be, 0x4747, 0x47c7, 0x46c0, 0x4727, 0x48af, 0x46c5, 0x482d, 0x483d, 0x482e, 0x4897, 0x479f, 0x488b, 0x4749, 0x489a, 0x46a8, 0x46f2, 0x488b, 0x4891, 0x47e8, 0x4872, 0x483d, 0x4716, 0x46fd, 0x49b5, 0x46a0, 0x46e7, 0x47a4, 0x48a1, 0x4767, +0x4939, 0x4935, 0x4703, 0x48c1, 0x4863, 0x48bd, 0x4913, 0x48cf, 0x48b6, 0x48b8, 0x4946, 0x4920, 0x495e, 0x48e1, 0x4938, 0x48b2, 0x493a, 0x4882, 0x483b, 0x49d5, 0x4911, 0x4972, 0x496b, 0x49df, 0x48f2, 0x4888, 0x4a46, 0x4821, 0x48c1, 0x490c, 0x49b2, 0x48a3, +0x463a, 0x47b0, 0x44cb, 0x4762, 0x4765, 0x46b9, 0x466a, 0x4814, 0x4631, 0x4796, 0x4666, 0x474b, 0x4798, 0x4704, 0x4838, 0x4761, 0x47d3, 0x4590, 0x45ea, 0x48a2, 0x47f1, 0x4844, 0x484b, 0x4776, 0x47d6, 0x46d8, 0x48f3, 0x44d3, 0x46fa, 0x478d, 0x481e, 0x466e, +0x4827, 0x481e, 0x45a2, 0x4794, 0x4727, 0x4806, 0x475d, 0x48d5, 0x4708, 0x4828, 0x4862, 0x480d, 0x4895, 0x4832, 0x48bd, 0x47f1, 0x482a, 0x46a7, 0x47b1, 0x492d, 0x484d, 0x4884, 0x48dc, 0x485f, 0x476d, 0x480c, 0x48e9, 0x46d3, 0x4728, 0x4884, 0x48a0, 0x480e, +0x4862, 0x4813, 0x4675, 0x485a, 0x47e8, 0x4738, 0x4836, 0x4823, 0x46e7, 0x4821, 0x4822, 0x47b3, 0x4846, 0x4855, 0x4863, 0x4717, 0x4872, 0x47c1, 0x46d5, 0x488e, 0x47e2, 0x485f, 0x487c, 0x48b8, 0x481e, 0x4788, 0x48bd, 0x4677, 0x46c9, 0x47f8, 0x48fe, 0x47fc, +0x47a0, 0x47b2, 0x4588, 0x467e, 0x4662, 0x46c7, 0x46e8, 0x4812, 0x4536, 0x474e, 0x46c0, 0x468f, 0x481f, 0x4679, 0x46a1, 0x46e2, 0x4809, 0x4560, 0x4630, 0x47eb, 0x46b5, 0x4757, 0x4848, 0x477f, 0x46a6, 0x46d8, 0x4870, 0x459a, 0x4670, 0x4678, 0x47d2, 0x468c, +0x4762, 0x48c4, 0x46e3, 0x4791, 0x46b1, 0x486d, 0x47d0, 0x4867, 
0x468d, 0x47f6, 0x48a5, 0x4756, 0x4857, 0x4854, 0x4866, 0x4838, 0x484d, 0x46ec, 0x47d2, 0x48f6, 0x484a, 0x4879, 0x4848, 0x483c, 0x471d, 0x4806, 0x48fa, 0x4730, 0x4768, 0x47b8, 0x4865, 0x46f9, +0x48a8, 0x4918, 0x46ca, 0x4867, 0x4800, 0x4862, 0x48d3, 0x4910, 0x474e, 0x4849, 0x48eb, 0x486b, 0x4966, 0x48c5, 0x48f4, 0x4830, 0x48f9, 0x4778, 0x481e, 0x499e, 0x48cf, 0x48f1, 0x4982, 0x4923, 0x487c, 0x47cf, 0x49ea, 0x4649, 0x4773, 0x495e, 0x48b2, 0x483f, +0x48a7, 0x4975, 0x4616, 0x481e, 0x481f, 0x4866, 0x48b6, 0x4864, 0x47dc, 0x4873, 0x485c, 0x487f, 0x4938, 0x491f, 0x490d, 0x48b6, 0x48f8, 0x48a1, 0x4859, 0x492d, 0x489c, 0x4915, 0x4899, 0x4887, 0x486c, 0x4859, 0x49ca, 0x471e, 0x4867, 0x4918, 0x48d3, 0x4827, +0x488b, 0x4998, 0x4704, 0x481d, 0x48b8, 0x4880, 0x4876, 0x4944, 0x470c, 0x48f2, 0x48b9, 0x489b, 0x4956, 0x48e5, 0x48d6, 0x48a5, 0x48dc, 0x4856, 0x484e, 0x49ab, 0x48e0, 0x490e, 0x48dd, 0x4945, 0x488b, 0x48dd, 0x4a32, 0x47ea, 0x4835, 0x4911, 0x4965, 0x4819, +0x460e, 0x481e, 0x452c, 0x4673, 0x475c, 0x4717, 0x46f6, 0x46d0, 0x4696, 0x46bc, 0x4726, 0x481e, 0x4763, 0x46ea, 0x46fe, 0x4758, 0x478b, 0x4627, 0x4704, 0x483f, 0x46ad, 0x47b1, 0x4792, 0x4816, 0x46f2, 0x4684, 0x4827, 0x45a8, 0x472f, 0x47a4, 0x4797, 0x462b, +0x483f, 0x48ab, 0x468f, 0x4863, 0x485a, 0x4766, 0x481d, 0x48cb, 0x47dc, 0x4903, 0x48fc, 0x4830, 0x48cc, 0x483e, 0x48ab, 0x4864, 0x4966, 0x4763, 0x4794, 0x499d, 0x488e, 0x488b, 0x48dc, 0x4960, 0x4854, 0x484c, 0x499c, 0x474c, 0x4826, 0x48bc, 0x4949, 0x4883, +0x489d, 0x4905, 0x4718, 0x481e, 0x48e3, 0x48f4, 0x48c1, 0x4904, 0x47e8, 0x48b3, 0x4892, 0x48d4, 0x48ff, 0x4894, 0x48d5, 0x4886, 0x48fa, 0x4803, 0x47d2, 0x492e, 0x4870, 0x48b2, 0x48e5, 0x492b, 0x487b, 0x4785, 0x49e3, 0x471d, 0x4837, 0x48bf, 0x489b, 0x48c4, +0x475c, 0x4871, 0x464a, 0x4811, 0x47af, 0x471c, 0x4817, 0x4817, 0x463b, 0x484e, 0x477f, 0x464f, 0x4704, 0x487c, 0x47a3, 0x4725, 0x4853, 0x462a, 0x465a, 0x4860, 0x4736, 0x4880, 0x47e1, 0x482b, 0x4811, 0x46c0, 0x48dc, 0x475d, 0x4668, 0x4806, 0x4893, 0x46f4, +0x4858, 
0x4959, 0x463d, 0x487b, 0x480f, 0x484e, 0x48c0, 0x48a6, 0x4847, 0x4894, 0x48a0, 0x484a, 0x491e, 0x48f4, 0x48fc, 0x48b5, 0x48ce, 0x47d2, 0x47db, 0x497f, 0x4955, 0x4939, 0x48a7, 0x48ce, 0x4890, 0x4884, 0x49d6, 0x4763, 0x486e, 0x4922, 0x48f4, 0x48c3, +0x47ec, 0x491c, 0x4698, 0x4783, 0x4715, 0x4754, 0x4745, 0x4752, 0x472f, 0x4832, 0x4817, 0x4809, 0x47f8, 0x48c3, 0x47e6, 0x4800, 0x48b6, 0x4730, 0x480a, 0x48cb, 0x479e, 0x488e, 0x47c2, 0x488e, 0x472f, 0x47ee, 0x489d, 0x4744, 0x4755, 0x4851, 0x4846, 0x47d3, +0x4838, 0x48a0, 0x4634, 0x4762, 0x4786, 0x4806, 0x47e3, 0x482d, 0x4726, 0x486c, 0x47b7, 0x4803, 0x48ac, 0x4814, 0x48e0, 0x4839, 0x4827, 0x4750, 0x46f2, 0x48c5, 0x483f, 0x4886, 0x48ad, 0x4856, 0x47e8, 0x47a9, 0x4937, 0x4743, 0x46d0, 0x481f, 0x484c, 0x4804, +0x47fd, 0x481f, 0x456d, 0x4813, 0x474d, 0x4807, 0x4688, 0x480e, 0x46e8, 0x4810, 0x469f, 0x4799, 0x4853, 0x478f, 0x47f2, 0x4824, 0x47d0, 0x471f, 0x46da, 0x485f, 0x4813, 0x481c, 0x482e, 0x4863, 0x4786, 0x480b, 0x48c9, 0x46b8, 0x475a, 0x46e2, 0x4852, 0x46c5, +0x45af, 0x4802, 0x4466, 0x46c2, 0x465d, 0x4743, 0x46b7, 0x47ba, 0x4636, 0x46c3, 0x4677, 0x4784, 0x485a, 0x47c2, 0x46dc, 0x46ac, 0x47de, 0x460e, 0x465f, 0x4834, 0x47f4, 0x4769, 0x46fc, 0x4810, 0x45fd, 0x45ea, 0x48d0, 0x45b5, 0x4704, 0x4783, 0x4830, 0x46c4, +0x4759, 0x47c7, 0x453d, 0x45b0, 0x4741, 0x4702, 0x4736, 0x4793, 0x461b, 0x47ba, 0x470b, 0x46dd, 0x4657, 0x470b, 0x470d, 0x4710, 0x486c, 0x468f, 0x45c3, 0x46ba, 0x479d, 0x483b, 0x46c9, 0x4774, 0x46a9, 0x46a7, 0x4833, 0x4606, 0x4690, 0x46a9, 0x46f5, 0x46a7, +0x47ac, 0x48bb, 0x452c, 0x4803, 0x470f, 0x4824, 0x47d5, 0x48cb, 0x4707, 0x484a, 0x4832, 0x4797, 0x4851, 0x482c, 0x487a, 0x4877, 0x4891, 0x465d, 0x47f4, 0x48ce, 0x4898, 0x4899, 0x484e, 0x486a, 0x47ac, 0x47f0, 0x493e, 0x4611, 0x47e2, 0x489e, 0x488c, 0x46af, +0x4665, 0x4836, 0x45e4, 0x46b6, 0x46a1, 0x46b9, 0x46c8, 0x46dd, 0x4658, 0x474b, 0x467b, 0x4777, 0x4769, 0x4798, 0x4785, 0x475e, 0x472a, 0x4656, 0x45fb, 0x4881, 0x46fc, 0x472d, 0x476e, 0x47a3, 0x465d, 0x46ca, 
0x4855, 0x4500, 0x464f, 0x479a, 0x46c3, 0x4738, +0x481e, 0x486c, 0x4659, 0x4801, 0x4756, 0x477a, 0x47d5, 0x487b, 0x4706, 0x4808, 0x484f, 0x4838, 0x4870, 0x4863, 0x48d3, 0x4806, 0x4865, 0x4771, 0x46be, 0x494c, 0x4915, 0x484c, 0x4900, 0x4862, 0x481a, 0x46e8, 0x4974, 0x46a0, 0x4775, 0x483d, 0x487c, 0x480e +}; \ No newline at end of file diff --git a/hwpe/redmule_256iter/pulp_inject_fault.tcl b/hwpe/redmule_256iter/pulp_inject_fault.tcl new file mode 100644 index 0000000..61ccadf --- /dev/null +++ b/hwpe/redmule_256iter/pulp_inject_fault.tcl @@ -0,0 +1,53 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Author: Michael Rogenmoser (michaero@iis.ee.ethz.ch) + +transcript quietly +if {! [info exists ::env(VSIM_PATH)]} {error "Define VSIM_PATH"} +set utils_base_path [file join $::env(VSIM_PATH) scripts fault_injection_utils] +set script_base_path [file join $::env(VSIM_PATH) fault_injection_sim scripts] + +set verbosity 2 +set log_injections 1 +# Easy way to generate a variable seed +# set seed [clock seconds] +# Default value +set seed 12345 +set print_statistics 1 + +set inject_start_time 550000000000ps +set inject_stop_time 750000000000ps +set injection_clock "pulp_cluster_tb/cluster_i/clk_i" +set injection_clock_trigger 0 +set fault_period 150 +set rand_initial_injection_phase 0 +# max_num set to 0 means until stop_time +set max_num_fault_inject 0 +set signal_fault_duration 20ns +set register_fault_duration 0ns + +set allow_multi_bit_upset $::env(MULTI_BIT_UPSET) +set use_bitwidth_as_weight 0 +set check_core_output_modification 0 +set check_core_next_state_modification 0 +set reg_to_sig_ratio 1 + +source [file join $utils_base_path pulp_extract_nets.tcl] + +set inject_signals_netlist [] +set inject_register_netlist [] +set output_netlist [] +set next_state_netlist [] +set assertion_disable_list [] + +# for {set idx 0} {$idx < 12} {incr idx} { +# 
set inject_signals_netlist [list {*}$inject_signals_netlist {*}[get_all_core_nets $idx]] +# set output_netlist [list {*}$output_netlist {*}[get_core_output_nets $idx]] +# } + +set inject_register_netlist [list {*}$inject_register_netlist {*}[get_memory_slice {0 16} {256 336}]] + +source [file join $script_base_path inject_fault.tcl] + diff --git a/hwpe/redmule_256iter/redmule.c b/hwpe/redmule_256iter/redmule.c new file mode 100644 index 0000000..77019ae --- /dev/null +++ b/hwpe/redmule_256iter/redmule.c @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2022-2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Author: Yvan Tortorella + * + * RedMulE SW test + */ + +#include +#include "stdio.h" +#include "archi_redmule.h" +#include "hal_redmule.h" +#include "pulp.h" + +#define NB_ITER 255 + +int main() { + + volatile int errors = 0; + unsigned int cluster_id = rt_cluster_id(); + unsigned int intc_data_correctable_cnt, redmule_data_correctable_cnt = 0; + unsigned int intc_meta_correctable_cnt = 0; + unsigned int intc_data_uncorrectable_cnt, redmule_data_uncorrectable_cnt = 0; + unsigned int intc_meta_uncorrectable_cnt = 0; + + if(get_core_id() == 0){ + + uint16_t m_size = M_SIZE; + uint16_t n_size = N_SIZE; + uint16_t k_size = K_SIZE; + + uint8_t *x_ext = x_inp; + uint8_t *w_ext = w_inp; + uint8_t *y_ext = y_inp; + uint8_t *z_ext = z_oup; + + uint8_t volatile *x = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*n_size)); + uint8_t volatile *w = (uint8_t volatile *) pi_l1_malloc(0, (2*n_size*k_size)); + uint8_t volatile *y = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*k_size)); + uint8_t volatile *z = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*k_size)); + + #ifdef USE_DMA + volatile unsigned int dma_id = 0; + dma_id = mchan_alloc(); + mchan_transfer((unsigned int) 2*(2*m_size*n_size), + (unsigned int) x_ext, + (unsigned int) x ); + mchan_barrier(dma_id); + mchan_free(dma_id); + + dma_id = mchan_alloc(); + mchan_transfer((unsigned int) 2*(2*n_size*k_size), + (unsigned int) w_ext, + (unsigned int) w ); + mchan_barrier(dma_id); + mchan_free(dma_id); + + dma_id = mchan_alloc(); + mchan_transfer((unsigned int) 2*(2*m_size*k_size), + (unsigned int) y_ext, + (unsigned int) y ); + mchan_barrier(dma_id); + #else + generate_test_data16((int) x, (int) w, (int) y, (int) m_size, (int) n_size, (int) k_size); + #endif + + int gold_sum = 0, check_sum = 0; + int i,j; + + int offload_id_tmp, offload_id; + + // Enable RedMulE + hwpe_cg_enable(); + asm volatile("": : :"memory"); + + hwpe_soft_clear(); + asm volatile("": : :"memory"); + + 
volatile int job_id = -1; + + // job 0 + job_id = hwpe_wait_acquire(); + asm volatile("": : :"memory"); + redmule_x_add_set ((unsigned int) x); + redmule_w_add_set ((unsigned int) w); + redmule_y_add_set ((unsigned int) y); + redmule_z_add_set ((unsigned int) z); + redmule_cfg (m_size, n_size, k_size, gemm_ops); + asm volatile("": : :"memory"); + hwpe_trigger_job(); + asm volatile("": : :"memory"); + + // job 1 + job_id = hwpe_wait_acquire(); + asm volatile("": : :"memory"); + redmule_x_add_set ((unsigned int) x); + redmule_w_add_set ((unsigned int) w); + redmule_y_add_set ((unsigned int) y); + redmule_z_add_set ((unsigned int) z); + redmule_cfg (m_size, n_size, k_size, gemm_ops); + asm volatile("": : :"memory"); + hwpe_trigger_job(); + asm volatile("": : :"memory"); + + // jobs 2-255 + do { + job_id = hwpe_wait_acquire(); + asm volatile("": : :"memory"); + hwpe_trigger_job(); + asm volatile("": : :"memory"); + } while(job_id < NB_ITER); + + // Wait for end of computation + redmule_evt_wait(); + + // Disable RedMulE + hwpe_cg_disable(); + + errors = redmule_compare16((int) z, (int) m_size, (int) k_size); + + *(int *) 0x1A1040A0 = errors; + + printf ("Terminated test with %d errors. 
See you!\n", errors); + + } + synch_barrier(); + return (errors != 0); +} diff --git a/hwpe/redmule_softclear/Makefile b/hwpe/redmule_softclear/Makefile new file mode 100644 index 0000000..88346b6 --- /dev/null +++ b/hwpe/redmule_softclear/Makefile @@ -0,0 +1,20 @@ +PULP_APP = test +PULP_APP_SRCS = redmule.c +PULP_CFLAGS = -O3 + +ifeq ($(use_dma),1) + PULP_CFLAGS += -DUSE_DMA +endif + +ifeq ($(fault_inject),1) + export FAULT_INJECTION=1 + export FAULT_INJECTION_SCRIPT=$(CURDIR)/pulp_inject_fault.tcl +endif + +ifeq ($(multi_bit_upset),1) + export MULTI_BIT_UPSET=1 +else + export MULTI_BIT_UPSET=0 +endif + +include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk diff --git a/hwpe/redmule_softclear/archi_redmule.h b/hwpe/redmule_softclear/archi_redmule.h new file mode 100644 index 0000000..40eceee --- /dev/null +++ b/hwpe/redmule_softclear/archi_redmule.h @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2022-2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Author: Yvan Tortorella + * + * High-level architecture of RedMulE + * + */ + +#ifndef __ARCHI_REDMULE_H__ +#define __ARCHI_REDMULE_H__ + +/* + * |========================================================================| + * || || + * ||Control and generic configuration register layout || + * |========================================================================| + * || # reg | offset | bits | bitmask || content || + * ||-------+----------+---------+--------------++-------------------------|| + * || 0 | 0x0000 | 31: 0 | 0xFFFFFFFF || TRIGGER || + * || 1 | 0x0004 | 31: 0 | 0xFFFFFFFF || ACQUIRE || + * || 2 | 0x0008 | 31: 0 | 0xFFFFFFFF || EVT_ENABLE || + * || 3 | 0x000c | 31: 0 | 0xFFFFFFFF || STATUS || + * || 4 | 0x0010 | 31: 0 | 0xFFFFFFFF || RUNNING_JOB || + * || 5 | 0x0014 | 31: 0 | 0xFFFFFFFF || SOFT_CLEAR || + * |========================================================================| + * || || + * ||Job-dependent registers layout || + * |========================================================================| + * || # reg | offset | bits | bitmask || content || + * ||-------+----------+---------+--------------++-------------------------|| + * || 0 | 0x0040 | 31: 0 | 0xFFFFFFFF || X_ADDR || + * ||-------+----------+---------+--------------++-------------------------|| + * || 1 | 0x0044 | 31: 0 | 0xFFFFFFFF || W_ADDR || + * ||-------+----------+---------+--------------++-------------------------|| + * || 2 | 0x0048 | 31: 0 | 0xFFFFFFFF || Z_ADDR || + * ||-------+----------+---------+--------------++-------------------------|| + * || 3 | 0x004C | | || Matrix Config 0 Reg || + * || | | 31:16 | 0xFFFF0000 || K Size (W Columns) || + * || | | 15: 0 | 0x0000FFFF || M Size (X Rows) || + * ||-------+----------+---------+--------------++-------------------------|| + * || 4 | 0x0050 | | || Matrix Config 1 Reg || + * || | | 31:16 | 0xFFFFFFFF || N Size (X Cols/W Rows) || + * 
||-------+----------+---------+--------------++-------------------------|| + * || 5 | 0x0054 | | || Matrix Arithmetic Reg || + * || | | 12:10 | 0x00001C00 || Operation selection || + * || | | 9: 7 | 0x00000380 || Input/Output format || + * |========================================================================| + * + */ + +/* PULP Cluster Archi defines */ +#define ARCHI_CLUST_CTRL_BASE ARCHI_CLUSTER_CTRL_ADDR +#define ARCHI_CLUST_HWPE_BASE ARCHI_HWCE_ADDR +#define DMA_COMMAND_QUEUE ARCHI_MCHAN_DEMUX_ADDR +#define DMA_STATUS_REGISTER (ARCHI_MCHAN_DEMUX_ADDR + 4) +#define ARCHI_CL_HWPE_EVT0 12 +#define ARCHI_CL_HWPE_EVT1 13 +#define FC_DMA_EVENT 8 +#define CL_DMA_EVENT 22 +#define CLUST_CTRL_HWPE_EN 0x18 +#define CLUST_CTRL_HWPE_EN_MASK 0x800 +#define __builtin_bitinsert(a,b,c,d) (a | (((b << (32-c)) >> (32-c)) << d)) + +// RedMulE architecture +#define ADDR_WIDTH 32 +#define DATA_WIDTH 256 +#define REDMULE_FMT 16 +#define ARRAY_HEIGHT 4 +#define PIPE_REGS 3 +#define ARRAY_WIDTH 12 /* Superior limit is ARRAY_HEIGHT*PIPE_REGS */ + +// Commands +#define REDMULE_TRIGGER 0x00 +#define REDMULE_ACQUIRE 0x04 +#define REDMULE_FINISHED 0x08 +#define REDMULE_STATUS 0x0C +#define REDMULE_RUNNING_JOB 0x10 +#define REDMULE_SOFT_CLEAR 0x14 + +// Registers +#define REDMULE_REG_OFFS 0x40 +// #define REDMULE_REG_X_PTR 0x00 +// #define REDMULE_REG_W_PTR 0x04 +// #define REDMULE_REG_Z_PTR 0x08 +// #define REDMULE_MCFG0_PTR 0x0C +// #define REDMULE_MCFG1_PTR 0x10 +// #define REDMULE_ARITH_PTR 0x14 +#define REDMULE_REG_X_PTR 0x00 +#define REDMULE_REG_W_PTR 0x04 +#define REDMULE_REG_Y_PTR 0x08 +#define REDMULE_REG_Z_PTR 0x0C +#define REDMULE_REG_X_ITER_PTR 0x10 +#define REDMULE_REG_W_ITER_PTR 0x14 +#define REDMULE_REG_LEFTOVERS_PTR 0x18 +#define REDMULE_REG_LEFT_PARAMS_PTR 0x1C +#define REDMULE_REG_X_D1_STRIDE_PTR 0x20 +#define REDMULE_REG_W_TOT_LEN_PTR 0x24 +#define REDMULE_REG_TOT_X_READ_PTR 0x28 +#define REDMULE_REG_W_D0_STRIDE_PTR 0x2C +#define REDMULE_REG_YZ_TOT_LEN_PTR 0x30 
+#define REDMULE_REG_YZ_D0_STRIDE_PTR 0x34 +#define REDMULE_REG_YZ_D2_STRIDE_PTR 0x38 +#define REDMULE_REG_X_ROWS_OFFS_PTR 0x3C +#define REDMULE_REG_X_BUFFER_SLOTS_PTR 0x40 +#define REDMULE_REG_X_TOT_LEN_PTR 0x44 +#define REDMULE_REG_OP_SELECTION 0x48 + +#define REDMULE_ECC_REG_OFFS 0x90 +#define DATA_CORR_ERR 0x00 +#define DATA_UNCORR_ERR 0x04 +#define METADATA_CORR_ERR 0x08 +#define METADATA_UNCORR_ERR 0x0c + +// OPs definition +#define MATMUL 0x0 +#define GEMM 0x1 +#define ADDMAX 0x2 +#define ADDMIN 0x3 +#define MULMAX 0x4 +#define MULMIN 0x5 +#define MAXMIN 0x6 +#define MINMAX 0x7 + +// GEMM formats +#define Float8 0x0 +#define Float16 0x1 +#define Float8Alt 0x2 +#define Float16Alt 0x3 + +#define RNE 0x0 +#define RTZ 0x1 +#define OP_FMADD 0x0 +#define OP_ADD 0x2 +#define OP_MUL 0x3 +#define OP_MINMAX 0x7 + +// FP Formats encoding +#define FP16 0x2 +#define FP8 0x3 +#define FP16ALT 0x4 +#define FP8ALT 0x5 + +/* DMA Archi */ +#define DMA_TX 0 +#define DMA_RX 1 +#define DMA_INC 1 + +#define PLP_DMA_TYPE_BIT 0x00000011 +#define PLP_DMA_INCR_BIT 0x00000012 +#define PLP_DMA_2D_BIT 0x00000013 +#define PLP_DMA_ELE_BIT 0x00000014 +#define PLP_DMA_ILE_BIT 0x00000015 +#define PLP_DMA_BLE_BIT 0x00000016 +#define PLP_DMA_2D_TCDM_BIT 0x0000017 + +#endif diff --git a/hwpe/redmule_softclear/hal_redmule.h b/hwpe/redmule_softclear/hal_redmule.h new file mode 100644 index 0000000..8fc5000 --- /dev/null +++ b/hwpe/redmule_softclear/hal_redmule.h @@ -0,0 +1,556 @@ +/* + * Copyright (C) 2022-2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Yvan Tortorella + * + * RedMulE Hardware Abstraction Layer (HAL) + */ + +#ifndef __HAL_REDMULE_H__ +#define __HAL_REDMULE_H__ + +#include +#include "inc/x_input.h" +#include "inc/w_input.h" +#include "inc/y_input.h" +#include "inc/z_output.h" +#include "inc/golden.h" +#include "inc/tensor_dim.h" + +/* + * + * For control, generic configuration register layout, + * and job-dependent register map, look at redmule_archi.h + * + */ + +// For all the following functions we use __builtin_pulp_OffsetedWrite and __builtin_pulp_OffsetedRead +// instead of classic load/store because otherwise the compiler is not able to correctly factorize +// the HWPE base in case several accesses are done, ending up with twice more code + +#define HWPE_WRITE(value, offset) *(int *)(ARCHI_CLUST_HWPE_BASE + offset) = value +#define HWPE_READ(offset) *(int *)(ARCHI_CLUST_HWPE_BASE + offset) + +static inline void redmule_x_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_X_PTR); +} + +static inline void redmule_w_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_W_PTR); +} + +static inline void redmule_y_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Y_PTR); +} + +static inline void redmule_z_add_set (unsigned int value) { + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Z_PTR); +} + +// static inline void redmule_mcfg_set (uint32_t mcfg0, uint32_t mcfg1) { +// HWPE_WRITE(mcfg0, REDMULE_REG_OFFS + REDMULE_MCFG0_PTR); +// HWPE_WRITE(mcfg1, REDMULE_REG_OFFS + REDMULE_MCFG1_PTR); +// } +// +// static inline void redmule_arith_set (uint32_t arith) { +// HWPE_WRITE(arith, REDMULE_REG_OFFS + REDMULE_ARITH_PTR); +// } + +static inline void hwpe_trigger_job() { + HWPE_WRITE(0, REDMULE_TRIGGER); +} + +static inline int hwpe_acquire_job() { + 
return HWPE_READ(REDMULE_ACQUIRE); +} + +static inline unsigned int hwpe_get_status() { + return HWPE_READ(REDMULE_STATUS); +} + +static inline unsigned int hwpe_get_running_job() { + return HWPE_READ(REDMULE_RUNNING_JOB); +} + +static inline void hwpe_soft_clear() { + HWPE_WRITE(0, REDMULE_SOFT_CLEAR); +} + +static inline void hwpe_cg_enable() { + *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) |= CLUST_CTRL_HWPE_EN_MASK; +} + +static inline void hwpe_cg_disable() { + *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) &= ~CLUST_CTRL_HWPE_EN_MASK; +} + +static inline void redmule_evt_wait() { + do { + eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0); + } while((*(int volatile *)(ARCHI_CLUST_HWPE_BASE + REDMULE_STATUS)) != 0); +} + +static inline int hwpe_wait_acquire() { + int job_id = hwpe_acquire_job(); + while(job_id < 0) { + eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0); + job_id = hwpe_acquire_job(); + } + return job_id; +} + +static inline unsigned int redmule_get_data_correctable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + DATA_CORR_ERR); +} + +static inline unsigned int redmule_get_data_uncorrectable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + DATA_UNCORR_ERR); +} + +static inline unsigned int redmule_get_meta_correctable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + METADATA_CORR_ERR); +} + +static inline unsigned int redmule_get_meta_uncorrectable_count () { + return HWPE_READ(REDMULE_ECC_REG_OFFS + METADATA_UNCORR_ERR); +} + +/* DMA APIs */ +static inline int mchan_alloc(){ + return *(volatile int*) DMA_COMMAND_QUEUE; +} + +static inline void mchan_transfer(unsigned int len, + unsigned int ext_addr, + unsigned int tcdm_addr) { + + *(volatile int*) DMA_COMMAND_QUEUE = len | + (DMA_RX << PLP_DMA_TYPE_BIT) | + (DMA_INC << PLP_DMA_INCR_BIT) | + (0 << PLP_DMA_2D_BIT) | + (1 << PLP_DMA_ELE_BIT) | + (1 << PLP_DMA_ILE_BIT) | + (0 << PLP_DMA_BLE_BIT) | + (0 << PLP_DMA_2D_TCDM_BIT); + *(volatile int*) 
DMA_COMMAND_QUEUE = tcdm_addr; + *(volatile int*) DMA_COMMAND_QUEUE = ext_addr; +} + +static inline void mchan_barrier(int id) { + while(((*(volatile int*)(DMA_STATUS_REGISTER)) >> id ) & 0x1 ) { + eu_evt_maskWaitAndClr(1 << FC_DMA_EVENT); + } +} + +static inline void mchan_free(int id) { + *(volatile int*) DMA_STATUS_REGISTER = 0x1 << id; +} + +// void redmule_cfg (unsigned int x, unsigned int w, unsigned int z, +// uint16_t m_size, uint16_t n_size, uint16_t k_size, +// uint8_t gemm_op, uint8_t gemm_fmt){ +// +// uint32_t mcfg_reg0 = 0; +// uint32_t mcfg_reg1 = 0; +// uint32_t arith_reg = 0; +// +// mcfg_reg0 = (k_size << 16) | +// (m_size << 0); +// mcfg_reg1 = n_size << 0; +// +// arith_reg = (gemm_op << 10) | +// (gemm_fmt << 7); +// +// redmule_x_add_set ((unsigned int) x); +// redmule_w_add_set ((unsigned int) w); +// redmule_z_add_set ((unsigned int) z); +// redmule_mcfg_set ((unsigned int) mcfg_reg0, +// (unsigned int) mcfg_reg1); +// redmule_arith_set ((unsigned int) arith_reg); +// +// } + +void redmule_cfg (uint16_t m_size, uint16_t n_size, uint16_t k_size, uint8_t gemm_ops){ + uint32_t x_iters = 0; + uint32_t w_iters = 0; + uint32_t leftovers = 0; + uint32_t left_params = 0; + uint32_t x_d1_stride = 0; + uint32_t x_rows_offs = 0; + uint32_t w_tot_len = 0; + uint32_t w_d1_len = 0; + uint32_t w_d0_stride = 0; + uint32_t yz_tot_len = 0; + uint32_t yz_d0_stride = 0; + uint32_t yz_d2_stride = 0; + uint32_t tot_x_read = 0; + uint32_t x_buffer_slots = 0; + uint32_t op_selection = 0; + uint16_t tot_stores = 0; + uint16_t w_rows = n_size; + uint16_t depth = DATA_WIDTH/(ARRAY_HEIGHT*FPFORMAT); + uint8_t tile = ARRAY_HEIGHT*(PIPE_REGS + 1); + _Bool x_rows_sub = 0; + _Bool x_cols_sub = 0; + _Bool w_cols_sub = 0; + uint16_t x_rows_iter, + x_rows_iter_tmp, + w_rows_iter, + w_rows_iter_tmp; + uint16_t x_cols_iter, + x_cols_iter_tmp, + w_cols_iter, + w_cols_iter_tmp; + uint8_t x_rows_lftovr, + x_cols_lftovr, + w_rows_lftovr, + w_cols_lftovr, + slots; + + // Calculating 
the number of iterations alng the two dimensions of the X matrix + x_rows_iter_tmp = m_size/ARRAY_WIDTH; + x_cols_iter_tmp = n_size/tile; + + // Calculating the number of iterations alng the two dimensions of the W matrix + w_rows_iter_tmp = w_rows; + w_cols_iter_tmp = k_size/tile; + + // Calculating the residuals along the input dimensions + x_rows_lftovr = m_size - (x_rows_iter_tmp*ARRAY_WIDTH); + x_cols_lftovr = n_size - (x_cols_iter_tmp*tile); + + // Calculating the residuals along the weight dimensions + w_rows_lftovr = n_size - (ARRAY_HEIGHT*(w_rows/ARRAY_HEIGHT)); + w_cols_lftovr = k_size - (w_cols_iter_tmp*tile); + + if (w_cols_lftovr != 0) + w_cols_iter = w_cols_iter_tmp + 1; + else + w_cols_iter = w_cols_iter_tmp; + + if (w_rows_lftovr != 0) + w_rows_iter = w_rows_iter_tmp + ARRAY_HEIGHT - w_rows_lftovr; + else + w_rows_iter = w_rows_iter_tmp; + + if (x_cols_lftovr != 0) + x_cols_iter = x_cols_iter_tmp + 1; + else + x_cols_iter = x_cols_iter_tmp; + + if (x_rows_lftovr != 0) + x_rows_iter = x_rows_iter_tmp + 1; + else + x_rows_iter = x_rows_iter_tmp; + + if (x_cols_lftovr%depth != 0) + x_buffer_slots = x_cols_lftovr/depth + 1; + else + x_buffer_slots = x_cols_lftovr/depth; + + // Calculating the number of total stores + tot_stores = x_rows_iter*w_cols_iter; + + // Determining if input matrixes are sub-matrixes + if (m_size < ARRAY_WIDTH) + x_rows_sub = 1; + if (n_size < ARRAY_HEIGHT) + x_cols_sub = 1; + if (k_size < tile) + w_cols_sub = 1; + + // Operation selection + switch (gemm_ops) { + case MATMUL: + op_selection |= (RNE << 29 | RNE << 26 | OP_FMADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 0; + break; + + case GEMM: + op_selection |= (RNE << 29 | RNE << 26 | OP_FMADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case ADDMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_ADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case ADDMIN: + op_selection |= (RNE << 29 | RNE << 26 | 
OP_ADD << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MULMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_MUL << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MULMIN: + op_selection |= (RNE << 29 | RNE << 26 | OP_MUL << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MAXMIN: + op_selection |= (RTZ << 29 | RNE << 26 | OP_MINMAX << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + + case MINMAX: + op_selection |= (RNE << 29 | RTZ << 26 | OP_MINMAX << 22 | OP_MINMAX << 18 | SRC_FMT << 15 | DST_FMT << 12) | 1; + break; + } + + // Storing iterations and residuals in registers + x_iters |= x_rows_iter << 16 | x_cols_iter << 0; + w_iters |= w_rows_iter << 16 | w_cols_iter << 0; + leftovers |= x_rows_lftovr << 24 | x_cols_lftovr << 16 | w_rows_lftovr << 8 | w_cols_lftovr << 0; + left_params |= tot_stores << 16 | x_rows_sub << 15 | x_cols_sub << 14 | w_cols_sub << 13; + x_d1_stride = ((4*FPFORMAT)/ADDR_WIDTH)*(((DATA_WIDTH/FPFORMAT)*x_cols_iter_tmp) + x_cols_lftovr); + x_rows_offs = ARRAY_WIDTH*x_d1_stride; + w_tot_len = w_rows_iter*w_cols_iter*x_rows_iter; + w_d0_stride = ((4*FPFORMAT)/ADDR_WIDTH)*(((DATA_WIDTH/FPFORMAT)*w_cols_iter_tmp) + w_cols_lftovr); + yz_tot_len = ARRAY_WIDTH*x_rows_iter*w_cols_iter; + yz_d0_stride = w_d0_stride; + yz_d2_stride = ARRAY_WIDTH*w_d0_stride; + tot_x_read = x_rows_iter*x_cols_iter*w_cols_iter; + + // Writing the computations in configuration register + HWPE_WRITE(x_iters , REDMULE_REG_OFFS + REDMULE_REG_X_ITER_PTR ); + HWPE_WRITE(w_iters , REDMULE_REG_OFFS + REDMULE_REG_W_ITER_PTR ); + HWPE_WRITE(leftovers , REDMULE_REG_OFFS + REDMULE_REG_LEFTOVERS_PTR ); + HWPE_WRITE(left_params , REDMULE_REG_OFFS + REDMULE_REG_LEFT_PARAMS_PTR ); + HWPE_WRITE(x_d1_stride , REDMULE_REG_OFFS + REDMULE_REG_X_D1_STRIDE_PTR ); + HWPE_WRITE(x_rows_offs , REDMULE_REG_OFFS + REDMULE_REG_X_ROWS_OFFS_PTR ); + HWPE_WRITE(tot_x_read , REDMULE_REG_OFFS + 
REDMULE_REG_TOT_X_READ_PTR ); + HWPE_WRITE(x_buffer_slots, REDMULE_REG_OFFS + REDMULE_REG_X_BUFFER_SLOTS_PTR ); + HWPE_WRITE(w_tot_len , REDMULE_REG_OFFS + REDMULE_REG_W_TOT_LEN_PTR ); + HWPE_WRITE(w_d0_stride , REDMULE_REG_OFFS + REDMULE_REG_W_D0_STRIDE_PTR ); + HWPE_WRITE(yz_tot_len , REDMULE_REG_OFFS + REDMULE_REG_YZ_TOT_LEN_PTR ); + HWPE_WRITE(yz_d0_stride , REDMULE_REG_OFFS + REDMULE_REG_YZ_D0_STRIDE_PTR ); + HWPE_WRITE(yz_d2_stride , REDMULE_REG_OFFS + REDMULE_REG_YZ_D2_STRIDE_PTR ); + HWPE_WRITE(op_selection , REDMULE_REG_OFFS + REDMULE_REG_OP_SELECTION ); +} + +void generate_test_data16(int x_start_addr, + int w_start_addr, + int y_start_addr, + int m_size, + int n_size, + int k_size) { + + int x_addr = x_start_addr; + int w_addr = w_start_addr; + int y_addr = y_start_addr; + int x_end_addr = x_start_addr + (2*m_size*n_size); + int w_end_addr = w_start_addr + (2*n_size*k_size); + int y_end_addr = y_start_addr + (2*m_size*k_size); + + // Generating input stimuli from golden model + for (x_addr = x_start_addr; x_addr < x_end_addr; x_addr += 2) { + int x = x_addr - x_start_addr; + *(uint32_t *)(x_addr) = x_inp[x/2]; + } + + // Generating Weight stimuli from golden model + for (w_addr = w_start_addr; w_addr < w_end_addr; w_addr += 2) { + int w = w_addr - w_start_addr; + *(uint32_t *)(w_addr) = w_inp[w/2]; + } + + for (y_addr = y_start_addr; y_addr < y_end_addr; y_addr += 2) { + int y = y_addr - y_start_addr; + *(uint32_t *)(y_addr) = y_inp[y/2]; + } +} + +int redmule_compare16 (int z_start_addr, int m_size, int k_size) { + int err = 0; + int z_end_addr = z_start_addr + 2*m_size*k_size; + uint16_t z_computed; + uint16_t diff, diff_1, diff_2; + + for (int z_addr = z_start_addr; z_addr < z_end_addr; z_addr += 2) { + int z = z_addr - z_start_addr; + z_computed = *(uint32_t *)(z_addr); + + if ( z_computed != z_oup[z/2] ) { + diff_1 = z_computed - z_oup[z/2]; + if (diff_1 > 3) { + diff_2 = z_oup[z/2] - z_computed; + if (diff_2 > 3) { + err++; + } + } + } + } + + 
return err; + +} + +int redmule16_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { + #define ERR 0x0011 + uint32_t actual_word = 0; + uint16_t actual_MSHWord, actual_LSHWord; + uint32_t golden_word = 0; + uint16_t golden_MSHWord, golden_LSHWord; + uint32_t actual = 0; + uint32_t golden = 0; + + int errors = 0; + int error; + + for (int i=0; i golden_LSHWord) ? (actual_LSHWord - golden_LSHWord) : 0; + diff = (actual_LSHWord < golden_LSHWord) ? (golden_LSHWord - actual_LSHWord) : 0; + + if (diff > ERR) { + error = 1; + #ifdef VERBOSE + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("LSW: Error!\n"); + #endif + } + + // Checking Most Significant Half-Word + actual_MSHWord = (uint16_t)((actual_word >> 16) & 0x0000FFFF); + golden_MSHWord = (uint16_t)((golden_word >> 16) & 0x0000FFFF); + + diff = (actual_MSHWord > golden_MSHWord) ? (actual_MSHWord - golden_MSHWord) : 0; + diff = (actual_MSHWord < golden_MSHWord) ? (golden_MSHWord - actual_MSHWord) : 0; + + if (diff > ERR) { + error = 1; + #ifdef VERBOSE + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("MSW: Error!\n"); + #endif + } + + errors += error; + + #ifdef DEBUG + tfp_printf(" Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); + #endif + + #ifdef VERBOSE + if(error) { + if(errors==1) tfp_printf(" golden <- actual @ address @ index\n"); + tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z+i), i*4); + } + #endif + } + return errors; +} + +int redmule8_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { + #define ERR 0x0011 + uint32_t actual_word = 0; + uint8_t actual_Byte0, + actual_Byte1, + actual_Byte2, + actual_Byte3; + uint32_t golden_word = 0; + uint8_t golden_Byte0, + golden_Byte1, + golden_Byte2, + golden_Byte3; + uint32_t actual = 0; + uint32_t golden = 0; + + int errors = 0; + int error; + + for (int i=0; i golden_Byte0) ? (actual_Byte0 - golden_Byte0) : 0; + diff = (actual_Byte0 < golden_Byte0) ? 
(golden_Byte0 - actual_Byte0) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte0: Error!\n"); + } + + // Cheching Byte1 + actual_Byte1 = (uint8_t)( (actual_word >> 8 ) & 0x000000FF); + golden_Byte1 = (uint8_t)( (golden_word >> 8 ) & 0x000000FF); + + diff = (actual_Byte1 > golden_Byte1) ? (actual_Byte1 - golden_Byte1) : 0; + diff = (actual_Byte1 < golden_Byte1) ? (golden_Byte1 - actual_Byte1) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte1: Error!\n"); + } + + // Cheching Byte2 + actual_Byte2 = (uint8_t)( (actual_word >> 16 ) & 0x000000FF); + golden_Byte2 = (uint8_t)( (golden_word >> 16 ) & 0x000000FF); + + diff = (actual_Byte2 > golden_Byte2) ? (actual_Byte2 - golden_Byte2) : 0; + diff = (actual_Byte2 < golden_Byte2) ? (golden_Byte2 - actual_Byte2) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte2: Error!\n"); + } + + // Cheching Byte3 + actual_Byte3 = (uint8_t)( (actual_word >> 24 ) & 0x000000FF); + golden_Byte3 = (uint8_t)( (golden_word >> 24 ) & 0x000000FF); + + diff = (actual_Byte3 > golden_Byte3) ? (actual_Byte3 - golden_Byte3) : 0; + diff = (actual_Byte3 < golden_Byte3) ? 
(golden_Byte3 - actual_Byte3) : 0; + + if (diff > ERR) { + error = 1; + tfp_printf ("diff: 0x%08x\n", diff); + tfp_printf ("Byte3: Error!\n"); + } + + errors += error; + + #ifdef DEBUG + tfp_printf(" Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); + #endif + + #ifdef VERBOSE + if(error) { + if(errors==1) tfp_printf(" golden <- actual @ address @ index\n"); + tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z+i), i*4); + } + #endif + } + return errors; +} + +#endif diff --git a/hwpe/redmule_softclear/inc/golden.h b/hwpe/redmule_softclear/inc/golden.h new file mode 100644 index 0000000..f664e47 --- /dev/null +++ b/hwpe/redmule_softclear/inc/golden.h @@ -0,0 +1,387 @@ + /* Header file generated by RedMulE Golden Model */ +uint32_t golden [384] = { +0x48974845, +0x48384608, +0x487b4855, +0x48804869, +0x48b046d1, +0x483f48db, +0x485f48c9, +0x483a4881, +0x472c484b, +0x492b4762, +0x48fd4822, +0x492e488e, +0x484f483e, +0x46d749e8, +0x489d484b, +0x47e9490b, +0x47d2484f, +0x474744be, +0x46c047c7, +0x48af4727, +0x482d46c5, +0x482e483d, +0x479f4897, +0x4749488b, +0x46a8489a, +0x488b46f2, +0x47e84891, +0x483d4872, +0x46fd4716, +0x46a049b5, +0x47a446e7, +0x476748a1, +0x49354939, +0x48c14703, +0x48bd4863, +0x48cf4913, +0x48b848b6, +0x49204946, +0x48e1495e, +0x48b24938, +0x4882493a, +0x49d5483b, +0x49724911, +0x49df496b, +0x488848f2, +0x48214a46, +0x490c48c1, +0x48a349b2, +0x47b0463a, +0x476244cb, +0x46b94765, +0x4814466a, +0x47964631, +0x474b4666, +0x47044798, +0x47614838, +0x459047d3, +0x48a245ea, +0x484447f1, +0x4776484b, +0x46d847d6, +0x44d348f3, +0x478d46fa, +0x466e481e, +0x481e4827, +0x479445a2, +0x48064727, +0x48d5475d, +0x48284708, +0x480d4862, +0x48324895, +0x47f148bd, +0x46a7482a, +0x492d47b1, +0x4884484d, +0x485f48dc, +0x480c476d, +0x46d348e9, +0x48844728, +0x480e48a0, +0x48134862, +0x485a4675, +0x473847e8, +0x48234836, +0x482146e7, +0x47b34822, +0x48554846, +0x47174863, +0x47c14872, +0x488e46d5, +0x485f47e2, 
+0x48b8487c, +0x4788481e, +0x467748bd, +0x47f846c9, +0x47fc48fe, +0x47b247a0, +0x467e4588, +0x46c74662, +0x481246e8, +0x474e4536, +0x468f46c0, +0x4679481f, +0x46e246a1, +0x45604809, +0x47eb4630, +0x475746b5, +0x477f4848, +0x46d846a6, +0x459a4870, +0x46784670, +0x468c47d2, +0x48c44762, +0x479146e3, +0x486d46b1, +0x486747d0, +0x47f6468d, +0x475648a5, +0x48544857, +0x48384866, +0x46ec484d, +0x48f647d2, +0x4879484a, +0x483c4848, +0x4806471d, +0x473048fa, +0x47b84768, +0x46f94865, +0x491848a8, +0x486746ca, +0x48624800, +0x491048d3, +0x4849474e, +0x486b48eb, +0x48c54966, +0x483048f4, +0x477848f9, +0x499e481e, +0x48f148cf, +0x49234982, +0x47cf487c, +0x464949ea, +0x495e4773, +0x483f48b2, +0x497548a7, +0x481e4616, +0x4866481f, +0x486448b6, +0x487347dc, +0x487f485c, +0x491f4938, +0x48b6490d, +0x48a148f8, +0x492d4859, +0x4915489c, +0x48874899, +0x4859486c, +0x471e49ca, +0x49184867, +0x482748d3, +0x4998488b, +0x481d4704, +0x488048b8, +0x49444876, +0x48f2470c, +0x489b48b9, +0x48e54956, +0x48a548d6, +0x485648dc, +0x49ab484e, +0x490e48e0, +0x494548dd, +0x48dd488b, +0x47ea4a32, +0x49114835, +0x48194965, +0x481e460e, +0x4673452c, +0x4717475c, +0x46d046f6, +0x46bc4696, +0x481e4726, +0x46ea4763, +0x475846fe, +0x4627478b, +0x483f4704, +0x47b146ad, +0x48164792, +0x468446f2, +0x45a84827, +0x47a4472f, +0x462b4797, +0x48ab483f, +0x4863468f, +0x4766485a, +0x48cb481d, +0x490347dc, +0x483048fc, +0x483e48cc, +0x486448ab, +0x47634966, +0x499d4794, +0x488b488e, +0x496048dc, +0x484c4854, +0x474c499c, +0x48bc4826, +0x48834949, +0x4905489d, +0x481e4718, +0x48f448e3, +0x490448c1, +0x48b347e8, +0x48d44892, +0x489448ff, +0x488648d5, +0x480348fa, +0x492e47d2, +0x48b24870, +0x492b48e5, +0x4785487b, +0x471d49e3, +0x48bf4837, +0x48c4489b, +0x4871475c, +0x4811464a, +0x471c47af, +0x48174817, +0x484e463b, +0x464f477f, +0x487c4704, +0x472547a3, +0x462a4853, +0x4860465a, +0x48804736, +0x482b47e1, +0x46c04811, +0x475d48dc, +0x48064668, +0x46f44893, +0x49594858, +0x487b463d, +0x484e480f, +0x48a648c0, 
+0x48944847, +0x484a48a0, +0x48f4491e, +0x48b548fc, +0x47d248ce, +0x497f47db, +0x49394955, +0x48ce48a7, +0x48844890, +0x476349d6, +0x4922486e, +0x48c348f4, +0x491c47ec, +0x47834698, +0x47544715, +0x47524745, +0x4832472f, +0x48094817, +0x48c347f8, +0x480047e6, +0x473048b6, +0x48cb480a, +0x488e479e, +0x488e47c2, +0x47ee472f, +0x4744489d, +0x48514755, +0x47d34846, +0x48a04838, +0x47624634, +0x48064786, +0x482d47e3, +0x486c4726, +0x480347b7, +0x481448ac, +0x483948e0, +0x47504827, +0x48c546f2, +0x4886483f, +0x485648ad, +0x47a947e8, +0x47434937, +0x481f46d0, +0x4804484c, +0x481f47fd, +0x4813456d, +0x4807474d, +0x480e4688, +0x481046e8, +0x4799469f, +0x478f4853, +0x482447f2, +0x471f47d0, +0x485f46da, +0x481c4813, +0x4863482e, +0x480b4786, +0x46b848c9, +0x46e2475a, +0x46c54852, +0x480245af, +0x46c24466, +0x4743465d, +0x47ba46b7, +0x46c34636, +0x47844677, +0x47c2485a, +0x46ac46dc, +0x460e47de, +0x4834465f, +0x476947f4, +0x481046fc, +0x45ea45fd, +0x45b548d0, +0x47834704, +0x46c44830, +0x47c74759, +0x45b0453d, +0x47024741, +0x47934736, +0x47ba461b, +0x46dd470b, +0x470b4657, +0x4710470d, +0x468f486c, +0x46ba45c3, +0x483b479d, +0x477446c9, +0x46a746a9, +0x46064833, +0x46a94690, +0x46a746f5, +0x48bb47ac, +0x4803452c, +0x4824470f, +0x48cb47d5, +0x484a4707, +0x47974832, +0x482c4851, +0x4877487a, +0x465d4891, +0x48ce47f4, +0x48994898, +0x486a484e, +0x47f047ac, +0x4611493e, +0x489e47e2, +0x46af488c, +0x48364665, +0x46b645e4, +0x46b946a1, +0x46dd46c8, +0x474b4658, +0x4777467b, +0x47984769, +0x475e4785, +0x4656472a, +0x488145fb, +0x472d46fc, +0x47a3476e, +0x46ca465d, +0x45004855, +0x479a464f, +0x473846c3, +0x486c481e, +0x48014659, +0x477a4756, +0x487b47d5, +0x48084706, +0x4838484f, +0x48634870, +0x480648d3, +0x47714865, +0x494c46be, +0x484c4915, +0x48624900, +0x46e8481a, +0x46a04974, +0x483d4775, +0x480e487c, +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/tensor_dim.h b/hwpe/redmule_softclear/inc/tensor_dim.h new file mode 100644 index 0000000..21bd0d8 --- 
/dev/null +++ b/hwpe/redmule_softclear/inc/tensor_dim.h @@ -0,0 +1,13 @@ + /* Header file generated by RedMulE Golden Model */ +#ifndef __TENSOR_DIM__ +#define __TENSOR_DIM__ + +#define M_SIZE 24 +#define N_SIZE 32 +#define K_SIZE 32 +#define SRC_FMT FP16 +#define DST_FMT FP16 +#define FPFORMAT 16 +uint8_t gemm_ops = GEMM; + +#endif diff --git a/hwpe/redmule_softclear/inc/w_2D.h b/hwpe/redmule_softclear/inc/w_2D.h new file mode 100644 index 0000000..9409c64 --- /dev/null +++ b/hwpe/redmule_softclear/inc/w_2D.h @@ -0,0 +1,35 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t w_inp_2D [32][32] = { +0x311a, 0x39e0, 0x387d, 0x3a4a, 0x386f, 0x3ada, 0x392f, 0x3854, 0x3014, 0x2fd2, 0x31c9, 0x2fca, 0x2e55, 0x3bc8, 0x396d, 0x3b1d, 0x39f6, 0x333a, 0x3908, 0x3628, 0x3bab, 0x3b8b, 0x3b4a, 0x322d, 0x3925, 0x317a, 0x3725, 0x31c2, 0x3066, 0x38f3, 0x3a17, 0x3476, +0x3bda, 0x3196, 0x3922, 0x3680, 0x396a, 0x3021, 0x3761, 0x374d, 0x2fc2, 0x3967, 0x3b94, 0x33b5, 0x3797, 0x34d6, 0x3655, 0x2176, 0x39bc, 0x3999, 0x3658, 0x3904, 0x3759, 0x2ade, 0x3a5a, 0x3b78, 0x36c7, 0x2d01, 0x3b58, 0x2d9a, 0x373d, 0x3952, 0x38e8, 0x3887, +0x37b6, 0x3a88, 0x2f8a, 0x2d79, 0x3413, 0x3421, 0x3976, 0x32b2, 0x3446, 0x2d99, 0x3a56, 0x3322, 0x3b49, 0x39fa, 0x3acd, 0x3af6, 0x304c, 0x3abb, 0x3a83, 0x38b2, 0x3ab9, 0x363e, 0x389f, 0x31bb, 0x38e1, 0x3bc4, 0x3b9b, 0x2984, 0x3a43, 0x3b2f, 0x35d6, 0x3bda, +0x2df3, 0x3bf8, 0x2acc, 0x378b, 0x3555, 0x2e59, 0x31d4, 0x34ec, 0x3a46, 0x3bab, 0x3214, 0x3161, 0x3470, 0x3a03, 0x368e, 0x31ad, 0x27cb, 0x2ecb, 0x3422, 0x39f7, 0x3644, 0x3a77, 0x313f, 0x34f2, 0x39b3, 0x3bf2, 0x379a, 0x3456, 0x35fe, 0x3ae7, 0x3964, 0x385f, +0x3b16, 0x3999, 0x3833, 0x2eda, 0x3afd, 0x3a4a, 0x3ba2, 0x2bd4, 0x3b38, 0x31a2, 0x32dd, 0x353c, 0x366f, 0x375e, 0x3821, 0x367a, 0x3b44, 0x39e6, 0x3787, 0x339e, 0x39d7, 0x38c6, 0x37d5, 0x342f, 0x3984, 0x319b, 0x33b5, 0x35ab, 0x398a, 0x374e, 0x36b6, 0x3b21, +0x3bbb, 0x2ab3, 0x2ad5, 0x33bc, 0x2bef, 0x3780, 0x3738, 0x3a0b, 0x3b09, 0x30ca, 0x384e, 
0x3ab3, 0x39bd, 0x3453, 0x3a6d, 0x3957, 0x2c10, 0x30e9, 0x35d4, 0x3aef, 0x3be9, 0x39ad, 0x3a74, 0x3af9, 0x3739, 0x2d4d, 0x39fe, 0x3b72, 0x2c57, 0x398c, 0x381f, 0x3930, +0x3820, 0x321b, 0x3964, 0x2964, 0x33a0, 0x2d00, 0x2490, 0x336b, 0x3465, 0x3b2e, 0x3aa0, 0x371f, 0x300e, 0x3a09, 0x3bf1, 0x25cc, 0x3b6f, 0x3384, 0x3a88, 0x3acb, 0x3814, 0x36d0, 0x3081, 0x3a2c, 0x3353, 0x39cb, 0x31ed, 0x3af6, 0x3721, 0x36c7, 0x2ce2, 0x390d, +0x3698, 0x3ab2, 0x3b3e, 0x2eb4, 0x3998, 0x39e3, 0x3a77, 0x3632, 0x2c12, 0x3bd5, 0x3ba3, 0x3bba, 0x323c, 0x367b, 0x3557, 0x39c8, 0x37db, 0x3b45, 0x3b6e, 0x3931, 0x3121, 0x3a8d, 0x3a55, 0x3b9b, 0x358a, 0x3925, 0x3491, 0x3912, 0x3b6b, 0x3584, 0x32df, 0x3120, +0x32b2, 0x3b0a, 0x2cad, 0x3465, 0x3ad3, 0x3bcd, 0x363b, 0x3afe, 0x354b, 0x3374, 0x39af, 0x3b7f, 0x308c, 0x2e72, 0x3380, 0x3b70, 0x3902, 0x38d8, 0x39f3, 0x3a4b, 0x3853, 0x397b, 0x2ebe, 0x387f, 0x2845, 0x37e2, 0x360f, 0x370b, 0x3acb, 0x35d4, 0x36e6, 0x3262, +0x2e88, 0x3a54, 0x2ee3, 0x3575, 0x3afe, 0x2aee, 0x39a0, 0x3aae, 0x3693, 0x3432, 0x3834, 0x3b9b, 0x3bcb, 0x2e3a, 0x356d, 0x374e, 0x3924, 0x383c, 0x311e, 0x3ac5, 0x352d, 0x311e, 0x38ca, 0x34d4, 0x36ca, 0x34ed, 0x3a13, 0x33eb, 0x3639, 0x3828, 0x3b3c, 0x3939, +0x3837, 0x3521, 0x2cb5, 0x3629, 0x3924, 0x384c, 0x366a, 0x3bbf, 0x2e9e, 0x3ba8, 0x33ad, 0x38c8, 0x3934, 0x3907, 0x249a, 0x3690, 0x3a09, 0x3215, 0x3898, 0x325d, 0x37d5, 0x3195, 0x361c, 0x3ae4, 0x351f, 0x3452, 0x3bc0, 0x375c, 0x39bf, 0x317a, 0x3aae, 0x283a, +0x3476, 0x3b92, 0x3472, 0x383e, 0x280f, 0x39d6, 0x2fd1, 0x31f4, 0x2ffb, 0x3b97, 0x3692, 0x36c0, 0x3989, 0x33cf, 0x3ba6, 0x3239, 0x35d7, 0x33ab, 0x31eb, 0x3b47, 0x389b, 0x3b88, 0x3580, 0x354c, 0x3802, 0x3b9a, 0x3b94, 0x2a92, 0x2db1, 0x38bd, 0x2dfb, 0x3900, +0x344f, 0x3739, 0x27a5, 0x3b2e, 0x342b, 0x34bb, 0x30c8, 0x3ae8, 0x3b26, 0x3982, 0x38c0, 0x3408, 0x38c8, 0x36ef, 0x3bf0, 0x3acf, 0x3a3c, 0x3825, 0x31a5, 0x3ada, 0x3b5b, 0x37db, 0x3a01, 0x3663, 0x3a7d, 0x327b, 0x3a1f, 0x3862, 0x38af, 0x3204, 0x372e, 0x3b19, +0x3708, 0x3622, 0x2e62, 0x39ab, 
0x2d4d, 0x31b4, 0x3552, 0x3bbc, 0x36f2, 0x36eb, 0x38ef, 0x3755, 0x3bbe, 0x2c17, 0x3815, 0x2f53, 0x363f, 0x38c1, 0x3246, 0x386b, 0x34de, 0x34e4, 0x3baa, 0x349e, 0x32ce, 0x3a68, 0x373f, 0x2cce, 0x3b36, 0x28ba, 0x3b50, 0x3232, +0x1f34, 0x3928, 0x35cd, 0x3b38, 0x30ce, 0x35a1, 0x3a06, 0x3a32, 0x3a53, 0x3489, 0x3241, 0x372f, 0x390c, 0x3a1b, 0x378a, 0x3713, 0x3769, 0x37a8, 0x3418, 0x3ad4, 0x3a4e, 0x3bf7, 0x37a5, 0x34dc, 0x39b2, 0x351b, 0x3372, 0x349f, 0x2f50, 0x3ab1, 0x3795, 0x2db7, +0x3864, 0x3157, 0x3900, 0x323e, 0x389e, 0x3880, 0x3b1f, 0x37a1, 0x396c, 0x2e43, 0x2c2a, 0x3b78, 0x3988, 0x3a14, 0x39c1, 0x3b51, 0x3780, 0x3bf2, 0x2d19, 0x3815, 0x3a5f, 0x3641, 0x2f62, 0x37d5, 0x3564, 0x139a, 0x3ab8, 0x28f7, 0x3785, 0x34e1, 0x3097, 0x3768, +0x3971, 0x3ae2, 0x32ae, 0x2fd5, 0x382a, 0x346c, 0x3133, 0x3167, 0x3940, 0x2d12, 0x389a, 0x3bd0, 0x3943, 0x391c, 0x3a75, 0x2a11, 0x391e, 0x372d, 0x3a79, 0x3b72, 0x3373, 0x39b7, 0x35d7, 0x372b, 0x3a6d, 0x38a1, 0x3279, 0x3434, 0x3694, 0x3b45, 0x3abb, 0x392d, +0x34a8, 0x3757, 0x32ca, 0x345d, 0x36a5, 0x3854, 0x2dcd, 0x30af, 0x38dd, 0x3067, 0x3411, 0x3997, 0x397a, 0x3a64, 0x38b8, 0x3962, 0x3509, 0x3bb6, 0x3a66, 0x339f, 0x372a, 0x31a8, 0x37da, 0x36ff, 0x33c6, 0x31da, 0x3977, 0x3b72, 0x3841, 0x3567, 0x3433, 0x33b8, +0x39fe, 0x3a10, 0x3bf2, 0x35e7, 0x3a4a, 0x3b3e, 0x2ec7, 0x3aa4, 0x3846, 0x3af9, 0x38a9, 0x2c1f, 0x39ab, 0x349f, 0x31d6, 0x39ae, 0x3b79, 0x352d, 0x3516, 0x347c, 0x2f33, 0x35ad, 0x31c4, 0x3b52, 0x354b, 0x3786, 0x3ab7, 0x3896, 0x34ac, 0x352f, 0x37e6, 0x326a, +0x2e44, 0x34c7, 0x388d, 0x3bf4, 0x363f, 0x3b3d, 0x33b1, 0x3b8b, 0x3340, 0x37f7, 0x3b07, 0x25bf, 0x398e, 0x3505, 0x3bd7, 0x366d, 0x388a, 0x2cc0, 0x359a, 0x3b9a, 0x3b99, 0x379d, 0x3b6b, 0x39b8, 0x3223, 0x2703, 0x3ba9, 0x2ecb, 0x3759, 0x39d8, 0x37ac, 0x32cf, +0x35f2, 0x38a3, 0x399e, 0x3bd2, 0x3780, 0x3af3, 0x3b5e, 0x337b, 0x3a08, 0x35da, 0x3446, 0x3b25, 0x3ad0, 0x3bee, 0x3141, 0x32d8, 0x34ce, 0x2ac9, 0x3800, 0x3a8a, 0x2d53, 0x368a, 0x3561, 0x3998, 0x35a3, 0x3677, 0x3ab2, 0x3269, 0x3236, 
0x3b3e, 0x3aba, 0x3bac, +0x395d, 0x3820, 0x1df6, 0x3bb5, 0x35b5, 0x3675, 0x3b74, 0x360f, 0x34de, 0x3a0c, 0x3aeb, 0x299d, 0x3207, 0x3bd8, 0x2178, 0x3995, 0x3948, 0x3908, 0x3843, 0x2ea5, 0x3045, 0x3989, 0x345d, 0x39c5, 0x3a89, 0x3863, 0x3be0, 0x397a, 0x38f1, 0x39e2, 0x3b08, 0x352e, +0x385f, 0x28f2, 0x3bc3, 0x35e0, 0x380c, 0x3b9c, 0x3afc, 0x390a, 0x3689, 0x34fd, 0x2cf5, 0x308e, 0x342b, 0x3921, 0x3a67, 0x3ad6, 0x2986, 0x32fc, 0x35aa, 0x3507, 0x3608, 0x33fd, 0x3bf3, 0x39e2, 0x3b0f, 0x30b7, 0x3896, 0x3ae4, 0x2145, 0x35b6, 0x2e1d, 0x3ad1, +0x333d, 0x3afb, 0x2703, 0x3413, 0x1d7d, 0x3b7f, 0x3ae1, 0x303c, 0x3004, 0x39d3, 0x3554, 0x31a4, 0x354e, 0x3662, 0x39c5, 0x2eb7, 0x2c6e, 0x397f, 0x31d8, 0x1f0c, 0x38e3, 0x35f0, 0x2714, 0x28d1, 0x375e, 0x3a75, 0x3830, 0x3578, 0x397d, 0x3b18, 0x383c, 0x3498, +0x39ad, 0x3598, 0x23c4, 0x34ea, 0x3a61, 0x2b00, 0x3707, 0x3ae1, 0x37ae, 0x389d, 0x37fa, 0x3673, 0x3278, 0xf3e, 0x3809, 0x33c6, 0x3bf5, 0x3279, 0x3816, 0x360c, 0x39c8, 0x381f, 0x3741, 0x2d66, 0x38c0, 0x37d3, 0x377a, 0x3621, 0x2faf, 0x392e, 0x2de6, 0x33c5, +0x3803, 0x2600, 0x32e9, 0x39b4, 0x38d2, 0x34e8, 0x2fe6, 0x3199, 0x3643, 0x3a77, 0x27cc, 0x39d7, 0x34c6, 0x2ea8, 0x364e, 0x3b07, 0x31c7, 0x30a1, 0x31b1, 0x3b8f, 0x3571, 0x3b75, 0x3989, 0x3805, 0x39fb, 0x3945, 0x352b, 0x31d8, 0x3904, 0x3440, 0x3a57, 0x2cf7, +0x3b39, 0x2fcd, 0x2b89, 0x2edd, 0x3682, 0x36a9, 0x32c8, 0x37ac, 0x32a5, 0x3311, 0x394b, 0x3b84, 0x3aec, 0x3601, 0x2765, 0x3b69, 0x396b, 0x3727, 0x3bfe, 0x3907, 0x376f, 0x3674, 0x3973, 0x3671, 0x3491, 0x3993, 0x383f, 0x3335, 0x3989, 0x3550, 0x3077, 0x35f5, +0x3a59, 0x3950, 0x380c, 0x37cd, 0x30bf, 0x3607, 0x3afa, 0x3b5d, 0x32b9, 0x386b, 0x35bd, 0x3aca, 0x3ba5, 0x3b2d, 0x3b19, 0x3b8b, 0x345e, 0x2845, 0x34aa, 0x372a, 0x3448, 0x34f5, 0x3ae2, 0x3637, 0x2cb5, 0x354b, 0x3b15, 0x2ca8, 0x2641, 0x3178, 0x2cfe, 0x39b4, +0x3bdd, 0x3acb, 0x3a05, 0x38a2, 0x3b4a, 0x34e5, 0x395f, 0x394b, 0x34c4, 0x3aa5, 0x29bb, 0x2d96, 0x339d, 0x387c, 0x382e, 0x385a, 0x396b, 0x3aa9, 0x2f1e, 0x33a7, 0x3b90, 0x3b7b, 
0x3b5f, 0x39d3, 0x3b18, 0x354f, 0x2cdb, 0x3a6f, 0x3434, 0x34ff, 0x3a5b, 0x3b84, +0x3a33, 0x384b, 0x2e67, 0x3b85, 0x3853, 0x380c, 0x346a, 0x3aaa, 0x3492, 0x33e8, 0x3bf2, 0x38ae, 0x3a29, 0x3830, 0x3221, 0x35b1, 0x3a48, 0x2c68, 0x2ced, 0x3a7e, 0x3539, 0x3922, 0x374c, 0x3aaa, 0x2dae, 0x395d, 0x3b3d, 0x3890, 0x2cfe, 0x2dd6, 0x3bad, 0x33c5, +0x2c07, 0x3a2c, 0x37a8, 0x390f, 0x2fc8, 0x35ae, 0x388c, 0x30ee, 0x3674, 0x391d, 0x3bfc, 0x36bf, 0x322d, 0x3a78, 0x35c0, 0x3492, 0x3ac8, 0x3504, 0x3315, 0x381d, 0x3a7a, 0x3a08, 0x343c, 0x3bda, 0x341b, 0x39f0, 0x3b9e, 0x395d, 0x3c00, 0x38ab, 0x3bcf, 0x3564, +0x33c4, 0x3b0d, 0x3623, 0x33b9, 0x3b92, 0x1e71, 0x2c57, 0x36d0, 0x314b, 0x3a16, 0x3372, 0x341b, 0x3aaa, 0x3444, 0x396b, 0x2dd7, 0x3b30, 0x3559, 0x3b5b, 0x3a29, 0x2d19, 0x38b7, 0x3b01, 0x3afa, 0x398a, 0x3839, 0x3ac9, 0x2e31, 0x3924, 0x39f2, 0x3a7f, 0x3285 +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/w_input.h b/hwpe/redmule_softclear/inc/w_input.h new file mode 100644 index 0000000..dc4d3be --- /dev/null +++ b/hwpe/redmule_softclear/inc/w_input.h @@ -0,0 +1,35 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t w_inp [1024] = { +0x311a, 0x39e0, 0x387d, 0x3a4a, 0x386f, 0x3ada, 0x392f, 0x3854, 0x3014, 0x2fd2, 0x31c9, 0x2fca, 0x2e55, 0x3bc8, 0x396d, 0x3b1d, 0x39f6, 0x333a, 0x3908, 0x3628, 0x3bab, 0x3b8b, 0x3b4a, 0x322d, 0x3925, 0x317a, 0x3725, 0x31c2, 0x3066, 0x38f3, 0x3a17, 0x3476, +0x3bda, 0x3196, 0x3922, 0x3680, 0x396a, 0x3021, 0x3761, 0x374d, 0x2fc2, 0x3967, 0x3b94, 0x33b5, 0x3797, 0x34d6, 0x3655, 0x2176, 0x39bc, 0x3999, 0x3658, 0x3904, 0x3759, 0x2ade, 0x3a5a, 0x3b78, 0x36c7, 0x2d01, 0x3b58, 0x2d9a, 0x373d, 0x3952, 0x38e8, 0x3887, +0x37b6, 0x3a88, 0x2f8a, 0x2d79, 0x3413, 0x3421, 0x3976, 0x32b2, 0x3446, 0x2d99, 0x3a56, 0x3322, 0x3b49, 0x39fa, 0x3acd, 0x3af6, 0x304c, 0x3abb, 0x3a83, 0x38b2, 0x3ab9, 0x363e, 0x389f, 0x31bb, 0x38e1, 0x3bc4, 0x3b9b, 0x2984, 0x3a43, 0x3b2f, 0x35d6, 0x3bda, +0x2df3, 0x3bf8, 0x2acc, 0x378b, 0x3555, 0x2e59, 0x31d4, 
0x34ec, 0x3a46, 0x3bab, 0x3214, 0x3161, 0x3470, 0x3a03, 0x368e, 0x31ad, 0x27cb, 0x2ecb, 0x3422, 0x39f7, 0x3644, 0x3a77, 0x313f, 0x34f2, 0x39b3, 0x3bf2, 0x379a, 0x3456, 0x35fe, 0x3ae7, 0x3964, 0x385f, +0x3b16, 0x3999, 0x3833, 0x2eda, 0x3afd, 0x3a4a, 0x3ba2, 0x2bd4, 0x3b38, 0x31a2, 0x32dd, 0x353c, 0x366f, 0x375e, 0x3821, 0x367a, 0x3b44, 0x39e6, 0x3787, 0x339e, 0x39d7, 0x38c6, 0x37d5, 0x342f, 0x3984, 0x319b, 0x33b5, 0x35ab, 0x398a, 0x374e, 0x36b6, 0x3b21, +0x3bbb, 0x2ab3, 0x2ad5, 0x33bc, 0x2bef, 0x3780, 0x3738, 0x3a0b, 0x3b09, 0x30ca, 0x384e, 0x3ab3, 0x39bd, 0x3453, 0x3a6d, 0x3957, 0x2c10, 0x30e9, 0x35d4, 0x3aef, 0x3be9, 0x39ad, 0x3a74, 0x3af9, 0x3739, 0x2d4d, 0x39fe, 0x3b72, 0x2c57, 0x398c, 0x381f, 0x3930, +0x3820, 0x321b, 0x3964, 0x2964, 0x33a0, 0x2d00, 0x2490, 0x336b, 0x3465, 0x3b2e, 0x3aa0, 0x371f, 0x300e, 0x3a09, 0x3bf1, 0x25cc, 0x3b6f, 0x3384, 0x3a88, 0x3acb, 0x3814, 0x36d0, 0x3081, 0x3a2c, 0x3353, 0x39cb, 0x31ed, 0x3af6, 0x3721, 0x36c7, 0x2ce2, 0x390d, +0x3698, 0x3ab2, 0x3b3e, 0x2eb4, 0x3998, 0x39e3, 0x3a77, 0x3632, 0x2c12, 0x3bd5, 0x3ba3, 0x3bba, 0x323c, 0x367b, 0x3557, 0x39c8, 0x37db, 0x3b45, 0x3b6e, 0x3931, 0x3121, 0x3a8d, 0x3a55, 0x3b9b, 0x358a, 0x3925, 0x3491, 0x3912, 0x3b6b, 0x3584, 0x32df, 0x3120, +0x32b2, 0x3b0a, 0x2cad, 0x3465, 0x3ad3, 0x3bcd, 0x363b, 0x3afe, 0x354b, 0x3374, 0x39af, 0x3b7f, 0x308c, 0x2e72, 0x3380, 0x3b70, 0x3902, 0x38d8, 0x39f3, 0x3a4b, 0x3853, 0x397b, 0x2ebe, 0x387f, 0x2845, 0x37e2, 0x360f, 0x370b, 0x3acb, 0x35d4, 0x36e6, 0x3262, +0x2e88, 0x3a54, 0x2ee3, 0x3575, 0x3afe, 0x2aee, 0x39a0, 0x3aae, 0x3693, 0x3432, 0x3834, 0x3b9b, 0x3bcb, 0x2e3a, 0x356d, 0x374e, 0x3924, 0x383c, 0x311e, 0x3ac5, 0x352d, 0x311e, 0x38ca, 0x34d4, 0x36ca, 0x34ed, 0x3a13, 0x33eb, 0x3639, 0x3828, 0x3b3c, 0x3939, +0x3837, 0x3521, 0x2cb5, 0x3629, 0x3924, 0x384c, 0x366a, 0x3bbf, 0x2e9e, 0x3ba8, 0x33ad, 0x38c8, 0x3934, 0x3907, 0x249a, 0x3690, 0x3a09, 0x3215, 0x3898, 0x325d, 0x37d5, 0x3195, 0x361c, 0x3ae4, 0x351f, 0x3452, 0x3bc0, 0x375c, 0x39bf, 0x317a, 0x3aae, 0x283a, 
+0x3476, 0x3b92, 0x3472, 0x383e, 0x280f, 0x39d6, 0x2fd1, 0x31f4, 0x2ffb, 0x3b97, 0x3692, 0x36c0, 0x3989, 0x33cf, 0x3ba6, 0x3239, 0x35d7, 0x33ab, 0x31eb, 0x3b47, 0x389b, 0x3b88, 0x3580, 0x354c, 0x3802, 0x3b9a, 0x3b94, 0x2a92, 0x2db1, 0x38bd, 0x2dfb, 0x3900, +0x344f, 0x3739, 0x27a5, 0x3b2e, 0x342b, 0x34bb, 0x30c8, 0x3ae8, 0x3b26, 0x3982, 0x38c0, 0x3408, 0x38c8, 0x36ef, 0x3bf0, 0x3acf, 0x3a3c, 0x3825, 0x31a5, 0x3ada, 0x3b5b, 0x37db, 0x3a01, 0x3663, 0x3a7d, 0x327b, 0x3a1f, 0x3862, 0x38af, 0x3204, 0x372e, 0x3b19, +0x3708, 0x3622, 0x2e62, 0x39ab, 0x2d4d, 0x31b4, 0x3552, 0x3bbc, 0x36f2, 0x36eb, 0x38ef, 0x3755, 0x3bbe, 0x2c17, 0x3815, 0x2f53, 0x363f, 0x38c1, 0x3246, 0x386b, 0x34de, 0x34e4, 0x3baa, 0x349e, 0x32ce, 0x3a68, 0x373f, 0x2cce, 0x3b36, 0x28ba, 0x3b50, 0x3232, +0x1f34, 0x3928, 0x35cd, 0x3b38, 0x30ce, 0x35a1, 0x3a06, 0x3a32, 0x3a53, 0x3489, 0x3241, 0x372f, 0x390c, 0x3a1b, 0x378a, 0x3713, 0x3769, 0x37a8, 0x3418, 0x3ad4, 0x3a4e, 0x3bf7, 0x37a5, 0x34dc, 0x39b2, 0x351b, 0x3372, 0x349f, 0x2f50, 0x3ab1, 0x3795, 0x2db7, +0x3864, 0x3157, 0x3900, 0x323e, 0x389e, 0x3880, 0x3b1f, 0x37a1, 0x396c, 0x2e43, 0x2c2a, 0x3b78, 0x3988, 0x3a14, 0x39c1, 0x3b51, 0x3780, 0x3bf2, 0x2d19, 0x3815, 0x3a5f, 0x3641, 0x2f62, 0x37d5, 0x3564, 0x139a, 0x3ab8, 0x28f7, 0x3785, 0x34e1, 0x3097, 0x3768, +0x3971, 0x3ae2, 0x32ae, 0x2fd5, 0x382a, 0x346c, 0x3133, 0x3167, 0x3940, 0x2d12, 0x389a, 0x3bd0, 0x3943, 0x391c, 0x3a75, 0x2a11, 0x391e, 0x372d, 0x3a79, 0x3b72, 0x3373, 0x39b7, 0x35d7, 0x372b, 0x3a6d, 0x38a1, 0x3279, 0x3434, 0x3694, 0x3b45, 0x3abb, 0x392d, +0x34a8, 0x3757, 0x32ca, 0x345d, 0x36a5, 0x3854, 0x2dcd, 0x30af, 0x38dd, 0x3067, 0x3411, 0x3997, 0x397a, 0x3a64, 0x38b8, 0x3962, 0x3509, 0x3bb6, 0x3a66, 0x339f, 0x372a, 0x31a8, 0x37da, 0x36ff, 0x33c6, 0x31da, 0x3977, 0x3b72, 0x3841, 0x3567, 0x3433, 0x33b8, +0x39fe, 0x3a10, 0x3bf2, 0x35e7, 0x3a4a, 0x3b3e, 0x2ec7, 0x3aa4, 0x3846, 0x3af9, 0x38a9, 0x2c1f, 0x39ab, 0x349f, 0x31d6, 0x39ae, 0x3b79, 0x352d, 0x3516, 0x347c, 0x2f33, 0x35ad, 0x31c4, 0x3b52, 0x354b, 
0x3786, 0x3ab7, 0x3896, 0x34ac, 0x352f, 0x37e6, 0x326a, +0x2e44, 0x34c7, 0x388d, 0x3bf4, 0x363f, 0x3b3d, 0x33b1, 0x3b8b, 0x3340, 0x37f7, 0x3b07, 0x25bf, 0x398e, 0x3505, 0x3bd7, 0x366d, 0x388a, 0x2cc0, 0x359a, 0x3b9a, 0x3b99, 0x379d, 0x3b6b, 0x39b8, 0x3223, 0x2703, 0x3ba9, 0x2ecb, 0x3759, 0x39d8, 0x37ac, 0x32cf, +0x35f2, 0x38a3, 0x399e, 0x3bd2, 0x3780, 0x3af3, 0x3b5e, 0x337b, 0x3a08, 0x35da, 0x3446, 0x3b25, 0x3ad0, 0x3bee, 0x3141, 0x32d8, 0x34ce, 0x2ac9, 0x3800, 0x3a8a, 0x2d53, 0x368a, 0x3561, 0x3998, 0x35a3, 0x3677, 0x3ab2, 0x3269, 0x3236, 0x3b3e, 0x3aba, 0x3bac, +0x395d, 0x3820, 0x1df6, 0x3bb5, 0x35b5, 0x3675, 0x3b74, 0x360f, 0x34de, 0x3a0c, 0x3aeb, 0x299d, 0x3207, 0x3bd8, 0x2178, 0x3995, 0x3948, 0x3908, 0x3843, 0x2ea5, 0x3045, 0x3989, 0x345d, 0x39c5, 0x3a89, 0x3863, 0x3be0, 0x397a, 0x38f1, 0x39e2, 0x3b08, 0x352e, +0x385f, 0x28f2, 0x3bc3, 0x35e0, 0x380c, 0x3b9c, 0x3afc, 0x390a, 0x3689, 0x34fd, 0x2cf5, 0x308e, 0x342b, 0x3921, 0x3a67, 0x3ad6, 0x2986, 0x32fc, 0x35aa, 0x3507, 0x3608, 0x33fd, 0x3bf3, 0x39e2, 0x3b0f, 0x30b7, 0x3896, 0x3ae4, 0x2145, 0x35b6, 0x2e1d, 0x3ad1, +0x333d, 0x3afb, 0x2703, 0x3413, 0x1d7d, 0x3b7f, 0x3ae1, 0x303c, 0x3004, 0x39d3, 0x3554, 0x31a4, 0x354e, 0x3662, 0x39c5, 0x2eb7, 0x2c6e, 0x397f, 0x31d8, 0x1f0c, 0x38e3, 0x35f0, 0x2714, 0x28d1, 0x375e, 0x3a75, 0x3830, 0x3578, 0x397d, 0x3b18, 0x383c, 0x3498, +0x39ad, 0x3598, 0x23c4, 0x34ea, 0x3a61, 0x2b00, 0x3707, 0x3ae1, 0x37ae, 0x389d, 0x37fa, 0x3673, 0x3278, 0xf3e, 0x3809, 0x33c6, 0x3bf5, 0x3279, 0x3816, 0x360c, 0x39c8, 0x381f, 0x3741, 0x2d66, 0x38c0, 0x37d3, 0x377a, 0x3621, 0x2faf, 0x392e, 0x2de6, 0x33c5, +0x3803, 0x2600, 0x32e9, 0x39b4, 0x38d2, 0x34e8, 0x2fe6, 0x3199, 0x3643, 0x3a77, 0x27cc, 0x39d7, 0x34c6, 0x2ea8, 0x364e, 0x3b07, 0x31c7, 0x30a1, 0x31b1, 0x3b8f, 0x3571, 0x3b75, 0x3989, 0x3805, 0x39fb, 0x3945, 0x352b, 0x31d8, 0x3904, 0x3440, 0x3a57, 0x2cf7, +0x3b39, 0x2fcd, 0x2b89, 0x2edd, 0x3682, 0x36a9, 0x32c8, 0x37ac, 0x32a5, 0x3311, 0x394b, 0x3b84, 0x3aec, 0x3601, 0x2765, 0x3b69, 0x396b, 0x3727, 
0x3bfe, 0x3907, 0x376f, 0x3674, 0x3973, 0x3671, 0x3491, 0x3993, 0x383f, 0x3335, 0x3989, 0x3550, 0x3077, 0x35f5, +0x3a59, 0x3950, 0x380c, 0x37cd, 0x30bf, 0x3607, 0x3afa, 0x3b5d, 0x32b9, 0x386b, 0x35bd, 0x3aca, 0x3ba5, 0x3b2d, 0x3b19, 0x3b8b, 0x345e, 0x2845, 0x34aa, 0x372a, 0x3448, 0x34f5, 0x3ae2, 0x3637, 0x2cb5, 0x354b, 0x3b15, 0x2ca8, 0x2641, 0x3178, 0x2cfe, 0x39b4, +0x3bdd, 0x3acb, 0x3a05, 0x38a2, 0x3b4a, 0x34e5, 0x395f, 0x394b, 0x34c4, 0x3aa5, 0x29bb, 0x2d96, 0x339d, 0x387c, 0x382e, 0x385a, 0x396b, 0x3aa9, 0x2f1e, 0x33a7, 0x3b90, 0x3b7b, 0x3b5f, 0x39d3, 0x3b18, 0x354f, 0x2cdb, 0x3a6f, 0x3434, 0x34ff, 0x3a5b, 0x3b84, +0x3a33, 0x384b, 0x2e67, 0x3b85, 0x3853, 0x380c, 0x346a, 0x3aaa, 0x3492, 0x33e8, 0x3bf2, 0x38ae, 0x3a29, 0x3830, 0x3221, 0x35b1, 0x3a48, 0x2c68, 0x2ced, 0x3a7e, 0x3539, 0x3922, 0x374c, 0x3aaa, 0x2dae, 0x395d, 0x3b3d, 0x3890, 0x2cfe, 0x2dd6, 0x3bad, 0x33c5, +0x2c07, 0x3a2c, 0x37a8, 0x390f, 0x2fc8, 0x35ae, 0x388c, 0x30ee, 0x3674, 0x391d, 0x3bfc, 0x36bf, 0x322d, 0x3a78, 0x35c0, 0x3492, 0x3ac8, 0x3504, 0x3315, 0x381d, 0x3a7a, 0x3a08, 0x343c, 0x3bda, 0x341b, 0x39f0, 0x3b9e, 0x395d, 0x3c00, 0x38ab, 0x3bcf, 0x3564, +0x33c4, 0x3b0d, 0x3623, 0x33b9, 0x3b92, 0x1e71, 0x2c57, 0x36d0, 0x314b, 0x3a16, 0x3372, 0x341b, 0x3aaa, 0x3444, 0x396b, 0x2dd7, 0x3b30, 0x3559, 0x3b5b, 0x3a29, 0x2d19, 0x38b7, 0x3b01, 0x3afa, 0x398a, 0x3839, 0x3ac9, 0x2e31, 0x3924, 0x39f2, 0x3a7f, 0x3285 +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/x_2D.h b/hwpe/redmule_softclear/inc/x_2D.h new file mode 100644 index 0000000..0b589f8 --- /dev/null +++ b/hwpe/redmule_softclear/inc/x_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t x_inp_2D [24][32] = { +0x2153, 0x3bb5, 0x3896, 0x365f, 0x2483, 0x3518, 0x2dd1, 0x3bca, 0x397b, 0x29b1, 0x3705, 0x36c8, 0x398b, 0x3661, 0x2f05, 0x365a, 0x3bf9, 0x34df, 0x363b, 0x38d9, 0x39c6, 0x3abb, 0x3952, 0x38f2, 0x392d, 0x3b3e, 0x2afb, 0x3a9d, 0x353b, 0x3b73, 0x3a01, 0x3679, +0x3934, 0x397d, 0x2904, 
0x3822, 0x3462, 0x3b44, 0x39e9, 0x28be, 0x331e, 0x3a1d, 0x39e5, 0x34da, 0x3a19, 0x3906, 0x1d35, 0x3871, 0x31e7, 0x3b29, 0x325d, 0x3797, 0x2b2f, 0x38b4, 0x232f, 0x38aa, 0x3aca, 0x316f, 0x3811, 0x3950, 0x32ea, 0x3bc7, 0x382c, 0x38a2, +0x29ce, 0x3afa, 0x3a39, 0x2ccc, 0x39fd, 0x3b3d, 0x384a, 0x3a35, 0x3802, 0x366a, 0x37ec, 0x3598, 0x3bf8, 0x3a85, 0x3a1b, 0x386e, 0x3b4c, 0x39de, 0x38c2, 0x2f93, 0x3b4c, 0x39c4, 0x3b9e, 0x3844, 0x346d, 0x3bff, 0x32ce, 0x296d, 0x3130, 0x3b3d, 0x3b44, 0x369d, +0x3b13, 0x31ed, 0x330a, 0x3831, 0x34e7, 0x37b3, 0x331a, 0x3918, 0x32d3, 0x3995, 0x3991, 0x3919, 0x3a26, 0x385b, 0x2b76, 0x3a3b, 0x37f2, 0x26a7, 0x3225, 0x3b64, 0x28f0, 0x3456, 0x3822, 0x341e, 0x381a, 0x38d8, 0x2c11, 0x33be, 0x33ac, 0x353f, 0x3476, 0x3abc, +0x36ec, 0x3a1d, 0x39d3, 0x3821, 0x36ac, 0x3bce, 0x3ad2, 0x3616, 0x36a1, 0x2cb3, 0x38d2, 0x314f, 0x385c, 0x3b63, 0x3bb6, 0x2951, 0x372d, 0x2c42, 0x3823, 0x3883, 0x3872, 0x31ee, 0x36c5, 0x399a, 0x31b0, 0x3887, 0x3884, 0x3865, 0x3896, 0x36c3, 0x32e3, 0x346c, +0x3935, 0x3b50, 0x2b6d, 0x38cd, 0x388f, 0x3389, 0x395d, 0x31cd, 0x2efd, 0x3154, 0x2f35, 0x3444, 0x3293, 0x3b6b, 0x1bec, 0x3b69, 0x3bf3, 0x3611, 0x3508, 0x3742, 0x3a50, 0x3ab7, 0x3457, 0x38d3, 0x3344, 0x38e8, 0x33c0, 0x3668, 0x3bee, 0x3b21, 0x3727, 0x3121, +0x316c, 0x3288, 0x2d50, 0x2e74, 0x35d5, 0x37e2, 0x303d, 0x36af, 0x341f, 0x3436, 0x2df7, 0x399d, 0x30f4, 0x3aaf, 0x34e4, 0x2c2a, 0x3116, 0x34d3, 0x36ac, 0x35e3, 0x3760, 0x36e1, 0x3ad2, 0x3547, 0x38f4, 0x369c, 0x3ba9, 0x34f0, 0x3a39, 0x3b19, 0x36e6, 0x395d, +0x3be8, 0x3293, 0x3bfc, 0x3435, 0x2eb3, 0x3360, 0x3919, 0x3bed, 0x396a, 0x37fc, 0x3242, 0x384b, 0x38cb, 0x3b2c, 0x3b28, 0x28cf, 0x3828, 0x3855, 0x3ba9, 0x2fa7, 0x340b, 0x32f1, 0x3ada, 0x36fa, 0x31f5, 0x3436, 0x29d0, 0x33e6, 0x3232, 0x3bec, 0x3904, 0x2797, +0x3b81, 0x3bac, 0x38d2, 0x343d, 0x31af, 0x3b1e, 0x33fc, 0x3864, 0x3624, 0x3905, 0x2945, 0x3b52, 0x2d08, 0x3a17, 0x3b84, 0x3804, 0x3a24, 0x38a3, 0x3562, 0x3ae6, 0x3bba, 0x3a45, 0x3679, 0x31fa, 0x3994, 0x2c3d, 0x383f, 0x399d, 
0x34f7, 0x360e, 0x35f3, 0x38f0, +0x38d4, 0x399a, 0x3a48, 0x3987, 0x3b54, 0x382c, 0x3210, 0x35ef, 0x36ca, 0x31b4, 0x3625, 0x371f, 0x37bd, 0x3680, 0x3a3a, 0x3ac0, 0x3bbf, 0x3bf5, 0x39f2, 0x29c2, 0x363e, 0x3a4e, 0x3596, 0x3b1b, 0x3459, 0x3669, 0x3aa1, 0x39c3, 0x3376, 0x390d, 0x2456, 0x39b5, +0x3a66, 0x3ad8, 0x3b51, 0x36aa, 0x32be, 0x3ac8, 0x392b, 0x3740, 0x3a48, 0x38f5, 0x3b2d, 0x3a5f, 0x2ff3, 0x366f, 0x39d3, 0x35e5, 0x3822, 0x38db, 0x3b8a, 0x34be, 0x2d33, 0x36dd, 0x3578, 0x3bdf, 0x2c7e, 0x39cf, 0x32ff, 0x35c9, 0x3970, 0x3bcb, 0x351e, 0x3956, +0x2c42, 0x3308, 0x377a, 0x361c, 0x39a0, 0x36c9, 0x2dcb, 0x3bf2, 0x3b5f, 0x33ee, 0x24c1, 0x2ce9, 0x3927, 0x305d, 0x3702, 0x3119, 0x35f9, 0x3855, 0x3374, 0x349b, 0x3bcf, 0x2dea, 0x34f0, 0x363f, 0x37da, 0x3a74, 0x35fc, 0x35fa, 0x316b, 0x3804, 0x37a7, 0x3986, +0x3073, 0x3aed, 0x31c7, 0x3844, 0x34a4, 0x387d, 0x3a20, 0x3037, 0x3a00, 0x3b70, 0x377f, 0x3686, 0x3b7e, 0x38b3, 0x32e3, 0x3323, 0x391e, 0x3228, 0x3930, 0x3997, 0x3a5e, 0x398b, 0x3512, 0x35b0, 0x365c, 0x325d, 0x3b61, 0x38b8, 0x39a4, 0x3423, 0x3bd7, 0x38af, +0x2d3d, 0x382d, 0x38ac, 0x26ca, 0x395e, 0x21a8, 0x3520, 0x386f, 0x3b95, 0x32c0, 0x3b84, 0x3a51, 0x3b4b, 0x31d2, 0x3747, 0x3b96, 0x3b40, 0x3535, 0x38d1, 0x3899, 0x3b00, 0x3827, 0x3ae3, 0x38c8, 0x3a07, 0x338d, 0x2e96, 0x3a46, 0x394a, 0x39de, 0x2951, 0x3a02, +0x3838, 0x2d45, 0x28c0, 0x3958, 0x3070, 0x2aa2, 0x3510, 0x38ce, 0x271c, 0x3440, 0x3954, 0x30bc, 0x3b35, 0x2f1d, 0x3afb, 0x2dae, 0x356f, 0x2e13, 0x3981, 0x326d, 0x3a28, 0x3a36, 0x3a95, 0x38cb, 0x38db, 0x3150, 0x2c9e, 0x34c5, 0x3adb, 0x3bdf, 0x38f2, 0x3994, +0x36f8, 0x31c0, 0x3a4f, 0x3825, 0x394b, 0x3a8b, 0x38ac, 0x3167, 0x2e2d, 0x3a93, 0x34f3, 0x37bd, 0x3b63, 0x2f2f, 0x3ae0, 0x3ad8, 0x34a8, 0x2e1c, 0x3890, 0x3705, 0x3b69, 0x3bc1, 0x28af, 0x3b36, 0x348b, 0x3111, 0x3a8d, 0x389c, 0x3916, 0x36dc, 0x3bae, 0x3874, +0x3593, 0x3638, 0x3018, 0x3a56, 0x38a3, 0x2ad4, 0x3a25, 0x38d7, 0x3864, 0x31c1, 0x28d1, 0x39c8, 0x37d6, 0x2c7f, 0x3ba5, 0x34b8, 0x3bef, 0x3b83, 0x3ab5, 0x3062, 0x38bc, 
0x399c, 0x2ce4, 0x2f2c, 0x39bf, 0x2ed1, 0x385f, 0x37e0, 0x35ee, 0x397d, 0x3b0c, 0x3049, +0x39d5, 0x322e, 0x3936, 0x3747, 0x2e15, 0x3b41, 0x3874, 0x3bd0, 0x2c04, 0x3800, 0x375b, 0x3b2d, 0x38d8, 0x3a51, 0x3406, 0x38da, 0x38ba, 0x3497, 0x382e, 0x35fc, 0x39d4, 0x3775, 0x3b1e, 0x3813, 0x3649, 0x31af, 0x37bb, 0x334a, 0x3a6e, 0x3284, 0x26e0, 0x2e01, +0x2ebb, 0x344b, 0x3821, 0x381a, 0x385a, 0x2534, 0x3635, 0x2a92, 0x3b8c, 0x31f0, 0x3947, 0x3ac7, 0x3743, 0x3924, 0x39e4, 0x358f, 0x2b62, 0x392c, 0x3955, 0x3341, 0x3676, 0x38ac, 0x3957, 0x335b, 0x2ca2, 0x39ff, 0x37cb, 0x341f, 0x3ac9, 0x3b6c, 0x2f14, 0x34c3, +0x3018, 0x3169, 0x355b, 0x3624, 0x31ed, 0x379e, 0x3268, 0x309b, 0x35db, 0x3872, 0x3bdb, 0x34c7, 0x3408, 0x3359, 0x3920, 0x331f, 0x3866, 0x3af0, 0x2a1a, 0x39e0, 0x3b14, 0x34fa, 0x2d18, 0x3963, 0x35e8, 0x2539, 0x38f5, 0x37b3, 0x378f, 0x31b5, 0x3a6c, 0x3685, +0x3a06, 0x318a, 0x2934, 0x33c1, 0x3be8, 0x375b, 0x3860, 0x3543, 0x3702, 0x3951, 0x3677, 0x37ff, 0x2e27, 0x2e3a, 0x340f, 0x3817, 0x2f04, 0x357e, 0x3a1d, 0x2dd6, 0x252a, 0x3945, 0x162a, 0x3b19, 0x3a53, 0x35d2, 0x3a5d, 0x3474, 0x38e9, 0x374b, 0x387c, 0x1f1a, +0x38ac, 0x3291, 0x3393, 0x3b53, 0x3169, 0x3bca, 0x2f1a, 0x3551, 0x38a3, 0x28e3, 0x369d, 0x34a1, 0x38a8, 0x34c3, 0x3841, 0x390d, 0x3b13, 0x3282, 0x3a29, 0x3a78, 0x2df3, 0x3a37, 0x35f4, 0x35a6, 0x38e8, 0x3328, 0x3beb, 0x390b, 0x32dc, 0x34dc, 0x396d, 0x3a78, +0x39ba, 0x3a06, 0x2cdd, 0x3bc3, 0x2d43, 0x2992, 0x3663, 0x3a68, 0x2c3e, 0x394e, 0x2c9f, 0x380e, 0x37f5, 0x3557, 0x2873, 0x390f, 0x39e7, 0x3939, 0x3669, 0x385c, 0x3a68, 0x32c4, 0x2b04, 0x2d6d, 0x39d3, 0x3895, 0x331d, 0x3b59, 0x3463, 0x2b6a, 0x31de, 0x3296, +0x3aae, 0x3bcd, 0x345a, 0x3897, 0x374b, 0x3bd4, 0x38a2, 0x357f, 0x3402, 0x3a0c, 0x3507, 0x3865, 0x3a54, 0x3878, 0x3859, 0x383e, 0x32b5, 0x34ea, 0x328d, 0x38b6, 0x3464, 0x2f5b, 0x35ff, 0x3817, 0x2f24, 0x3533, 0x3b21, 0x37ba, 0x3837, 0x2e34, 0x3bad, 0x34bc +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/x_input.h 
b/hwpe/redmule_softclear/inc/x_input.h new file mode 100644 index 0000000..1e38d23 --- /dev/null +++ b/hwpe/redmule_softclear/inc/x_input.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t x_inp [768] = { +0x2153, 0x3bb5, 0x3896, 0x365f, 0x2483, 0x3518, 0x2dd1, 0x3bca, 0x397b, 0x29b1, 0x3705, 0x36c8, 0x398b, 0x3661, 0x2f05, 0x365a, 0x3bf9, 0x34df, 0x363b, 0x38d9, 0x39c6, 0x3abb, 0x3952, 0x38f2, 0x392d, 0x3b3e, 0x2afb, 0x3a9d, 0x353b, 0x3b73, 0x3a01, 0x3679, +0x3934, 0x397d, 0x2904, 0x3822, 0x3462, 0x3b44, 0x39e9, 0x28be, 0x331e, 0x3a1d, 0x39e5, 0x34da, 0x3a19, 0x3906, 0x1d35, 0x3871, 0x31e7, 0x3b29, 0x325d, 0x3797, 0x2b2f, 0x38b4, 0x232f, 0x38aa, 0x3aca, 0x316f, 0x3811, 0x3950, 0x32ea, 0x3bc7, 0x382c, 0x38a2, +0x29ce, 0x3afa, 0x3a39, 0x2ccc, 0x39fd, 0x3b3d, 0x384a, 0x3a35, 0x3802, 0x366a, 0x37ec, 0x3598, 0x3bf8, 0x3a85, 0x3a1b, 0x386e, 0x3b4c, 0x39de, 0x38c2, 0x2f93, 0x3b4c, 0x39c4, 0x3b9e, 0x3844, 0x346d, 0x3bff, 0x32ce, 0x296d, 0x3130, 0x3b3d, 0x3b44, 0x369d, +0x3b13, 0x31ed, 0x330a, 0x3831, 0x34e7, 0x37b3, 0x331a, 0x3918, 0x32d3, 0x3995, 0x3991, 0x3919, 0x3a26, 0x385b, 0x2b76, 0x3a3b, 0x37f2, 0x26a7, 0x3225, 0x3b64, 0x28f0, 0x3456, 0x3822, 0x341e, 0x381a, 0x38d8, 0x2c11, 0x33be, 0x33ac, 0x353f, 0x3476, 0x3abc, +0x36ec, 0x3a1d, 0x39d3, 0x3821, 0x36ac, 0x3bce, 0x3ad2, 0x3616, 0x36a1, 0x2cb3, 0x38d2, 0x314f, 0x385c, 0x3b63, 0x3bb6, 0x2951, 0x372d, 0x2c42, 0x3823, 0x3883, 0x3872, 0x31ee, 0x36c5, 0x399a, 0x31b0, 0x3887, 0x3884, 0x3865, 0x3896, 0x36c3, 0x32e3, 0x346c, +0x3935, 0x3b50, 0x2b6d, 0x38cd, 0x388f, 0x3389, 0x395d, 0x31cd, 0x2efd, 0x3154, 0x2f35, 0x3444, 0x3293, 0x3b6b, 0x1bec, 0x3b69, 0x3bf3, 0x3611, 0x3508, 0x3742, 0x3a50, 0x3ab7, 0x3457, 0x38d3, 0x3344, 0x38e8, 0x33c0, 0x3668, 0x3bee, 0x3b21, 0x3727, 0x3121, +0x316c, 0x3288, 0x2d50, 0x2e74, 0x35d5, 0x37e2, 0x303d, 0x36af, 0x341f, 0x3436, 0x2df7, 0x399d, 0x30f4, 0x3aaf, 0x34e4, 0x2c2a, 0x3116, 0x34d3, 0x36ac, 0x35e3, 0x3760, 0x36e1, 0x3ad2, 0x3547, 0x38f4, 0x369c, 0x3ba9, 
0x34f0, 0x3a39, 0x3b19, 0x36e6, 0x395d, +0x3be8, 0x3293, 0x3bfc, 0x3435, 0x2eb3, 0x3360, 0x3919, 0x3bed, 0x396a, 0x37fc, 0x3242, 0x384b, 0x38cb, 0x3b2c, 0x3b28, 0x28cf, 0x3828, 0x3855, 0x3ba9, 0x2fa7, 0x340b, 0x32f1, 0x3ada, 0x36fa, 0x31f5, 0x3436, 0x29d0, 0x33e6, 0x3232, 0x3bec, 0x3904, 0x2797, +0x3b81, 0x3bac, 0x38d2, 0x343d, 0x31af, 0x3b1e, 0x33fc, 0x3864, 0x3624, 0x3905, 0x2945, 0x3b52, 0x2d08, 0x3a17, 0x3b84, 0x3804, 0x3a24, 0x38a3, 0x3562, 0x3ae6, 0x3bba, 0x3a45, 0x3679, 0x31fa, 0x3994, 0x2c3d, 0x383f, 0x399d, 0x34f7, 0x360e, 0x35f3, 0x38f0, +0x38d4, 0x399a, 0x3a48, 0x3987, 0x3b54, 0x382c, 0x3210, 0x35ef, 0x36ca, 0x31b4, 0x3625, 0x371f, 0x37bd, 0x3680, 0x3a3a, 0x3ac0, 0x3bbf, 0x3bf5, 0x39f2, 0x29c2, 0x363e, 0x3a4e, 0x3596, 0x3b1b, 0x3459, 0x3669, 0x3aa1, 0x39c3, 0x3376, 0x390d, 0x2456, 0x39b5, +0x3a66, 0x3ad8, 0x3b51, 0x36aa, 0x32be, 0x3ac8, 0x392b, 0x3740, 0x3a48, 0x38f5, 0x3b2d, 0x3a5f, 0x2ff3, 0x366f, 0x39d3, 0x35e5, 0x3822, 0x38db, 0x3b8a, 0x34be, 0x2d33, 0x36dd, 0x3578, 0x3bdf, 0x2c7e, 0x39cf, 0x32ff, 0x35c9, 0x3970, 0x3bcb, 0x351e, 0x3956, +0x2c42, 0x3308, 0x377a, 0x361c, 0x39a0, 0x36c9, 0x2dcb, 0x3bf2, 0x3b5f, 0x33ee, 0x24c1, 0x2ce9, 0x3927, 0x305d, 0x3702, 0x3119, 0x35f9, 0x3855, 0x3374, 0x349b, 0x3bcf, 0x2dea, 0x34f0, 0x363f, 0x37da, 0x3a74, 0x35fc, 0x35fa, 0x316b, 0x3804, 0x37a7, 0x3986, +0x3073, 0x3aed, 0x31c7, 0x3844, 0x34a4, 0x387d, 0x3a20, 0x3037, 0x3a00, 0x3b70, 0x377f, 0x3686, 0x3b7e, 0x38b3, 0x32e3, 0x3323, 0x391e, 0x3228, 0x3930, 0x3997, 0x3a5e, 0x398b, 0x3512, 0x35b0, 0x365c, 0x325d, 0x3b61, 0x38b8, 0x39a4, 0x3423, 0x3bd7, 0x38af, +0x2d3d, 0x382d, 0x38ac, 0x26ca, 0x395e, 0x21a8, 0x3520, 0x386f, 0x3b95, 0x32c0, 0x3b84, 0x3a51, 0x3b4b, 0x31d2, 0x3747, 0x3b96, 0x3b40, 0x3535, 0x38d1, 0x3899, 0x3b00, 0x3827, 0x3ae3, 0x38c8, 0x3a07, 0x338d, 0x2e96, 0x3a46, 0x394a, 0x39de, 0x2951, 0x3a02, +0x3838, 0x2d45, 0x28c0, 0x3958, 0x3070, 0x2aa2, 0x3510, 0x38ce, 0x271c, 0x3440, 0x3954, 0x30bc, 0x3b35, 0x2f1d, 0x3afb, 0x2dae, 0x356f, 0x2e13, 0x3981, 0x326d, 
0x3a28, 0x3a36, 0x3a95, 0x38cb, 0x38db, 0x3150, 0x2c9e, 0x34c5, 0x3adb, 0x3bdf, 0x38f2, 0x3994, +0x36f8, 0x31c0, 0x3a4f, 0x3825, 0x394b, 0x3a8b, 0x38ac, 0x3167, 0x2e2d, 0x3a93, 0x34f3, 0x37bd, 0x3b63, 0x2f2f, 0x3ae0, 0x3ad8, 0x34a8, 0x2e1c, 0x3890, 0x3705, 0x3b69, 0x3bc1, 0x28af, 0x3b36, 0x348b, 0x3111, 0x3a8d, 0x389c, 0x3916, 0x36dc, 0x3bae, 0x3874, +0x3593, 0x3638, 0x3018, 0x3a56, 0x38a3, 0x2ad4, 0x3a25, 0x38d7, 0x3864, 0x31c1, 0x28d1, 0x39c8, 0x37d6, 0x2c7f, 0x3ba5, 0x34b8, 0x3bef, 0x3b83, 0x3ab5, 0x3062, 0x38bc, 0x399c, 0x2ce4, 0x2f2c, 0x39bf, 0x2ed1, 0x385f, 0x37e0, 0x35ee, 0x397d, 0x3b0c, 0x3049, +0x39d5, 0x322e, 0x3936, 0x3747, 0x2e15, 0x3b41, 0x3874, 0x3bd0, 0x2c04, 0x3800, 0x375b, 0x3b2d, 0x38d8, 0x3a51, 0x3406, 0x38da, 0x38ba, 0x3497, 0x382e, 0x35fc, 0x39d4, 0x3775, 0x3b1e, 0x3813, 0x3649, 0x31af, 0x37bb, 0x334a, 0x3a6e, 0x3284, 0x26e0, 0x2e01, +0x2ebb, 0x344b, 0x3821, 0x381a, 0x385a, 0x2534, 0x3635, 0x2a92, 0x3b8c, 0x31f0, 0x3947, 0x3ac7, 0x3743, 0x3924, 0x39e4, 0x358f, 0x2b62, 0x392c, 0x3955, 0x3341, 0x3676, 0x38ac, 0x3957, 0x335b, 0x2ca2, 0x39ff, 0x37cb, 0x341f, 0x3ac9, 0x3b6c, 0x2f14, 0x34c3, +0x3018, 0x3169, 0x355b, 0x3624, 0x31ed, 0x379e, 0x3268, 0x309b, 0x35db, 0x3872, 0x3bdb, 0x34c7, 0x3408, 0x3359, 0x3920, 0x331f, 0x3866, 0x3af0, 0x2a1a, 0x39e0, 0x3b14, 0x34fa, 0x2d18, 0x3963, 0x35e8, 0x2539, 0x38f5, 0x37b3, 0x378f, 0x31b5, 0x3a6c, 0x3685, +0x3a06, 0x318a, 0x2934, 0x33c1, 0x3be8, 0x375b, 0x3860, 0x3543, 0x3702, 0x3951, 0x3677, 0x37ff, 0x2e27, 0x2e3a, 0x340f, 0x3817, 0x2f04, 0x357e, 0x3a1d, 0x2dd6, 0x252a, 0x3945, 0x162a, 0x3b19, 0x3a53, 0x35d2, 0x3a5d, 0x3474, 0x38e9, 0x374b, 0x387c, 0x1f1a, +0x38ac, 0x3291, 0x3393, 0x3b53, 0x3169, 0x3bca, 0x2f1a, 0x3551, 0x38a3, 0x28e3, 0x369d, 0x34a1, 0x38a8, 0x34c3, 0x3841, 0x390d, 0x3b13, 0x3282, 0x3a29, 0x3a78, 0x2df3, 0x3a37, 0x35f4, 0x35a6, 0x38e8, 0x3328, 0x3beb, 0x390b, 0x32dc, 0x34dc, 0x396d, 0x3a78, +0x39ba, 0x3a06, 0x2cdd, 0x3bc3, 0x2d43, 0x2992, 0x3663, 0x3a68, 0x2c3e, 0x394e, 0x2c9f, 0x380e, 0x37f5, 
0x3557, 0x2873, 0x390f, 0x39e7, 0x3939, 0x3669, 0x385c, 0x3a68, 0x32c4, 0x2b04, 0x2d6d, 0x39d3, 0x3895, 0x331d, 0x3b59, 0x3463, 0x2b6a, 0x31de, 0x3296, +0x3aae, 0x3bcd, 0x345a, 0x3897, 0x374b, 0x3bd4, 0x38a2, 0x357f, 0x3402, 0x3a0c, 0x3507, 0x3865, 0x3a54, 0x3878, 0x3859, 0x383e, 0x32b5, 0x34ea, 0x328d, 0x38b6, 0x3464, 0x2f5b, 0x35ff, 0x3817, 0x2f24, 0x3533, 0x3b21, 0x37ba, 0x3837, 0x2e34, 0x3bad, 0x34bc +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/y_2D.h b/hwpe/redmule_softclear/inc/y_2D.h new file mode 100644 index 0000000..9484a10 --- /dev/null +++ b/hwpe/redmule_softclear/inc/y_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t y_inp_2D [32][32] = { +0x3150, 0x2dc1, 0x3033, 0x31f5, 0x3bb6, 0x3bff, 0x39f9, 0x3662, 0x3720, 0x351d, 0x384b, 0x3093, 0x3b9d, 0x35ad, 0x3695, 0x3466, 0x2300, 0x3445, 0x33ae, 0x3586, 0x38a3, 0x3bdb, 0x33a2, 0x379b, 0x3a0e, 0x38b0, 0x39ba, 0x379b, 0x39d3, 0x3a51, 0x3b30, 0x3794, +0x3b76, 0x3042, 0x38cc, 0x2dfc, 0x3b1a, 0x37fb, 0x38f7, 0x3824, 0x386f, 0x38c7, 0x36ee, 0x3a9c, 0x38d3, 0x2c67, 0x3a80, 0x2f30, 0x3328, 0x3721, 0x3790, 0x34e5, 0x3a6c, 0x3643, 0x3934, 0x3034, 0x38d4, 0x362e, 0x3b4b, 0x3408, 0x30c2, 0x370e, 0x3b31, 0x3b16, +0x3b6b, 0x39d4, 0x339c, 0x381e, 0x313e, 0x3671, 0x3ae2, 0x3479, 0x3940, 0x342d, 0x3925, 0x370a, 0x35d8, 0x2dad, 0x3888, 0x24b9, 0x375d, 0x34bd, 0x3243, 0x2ebb, 0x3970, 0x3a21, 0x3a07, 0x3877, 0x3888, 0x3569, 0x372d, 0x2ac1, 0x331e, 0x384d, 0x3996, 0x34a4, +0x35c1, 0x33a9, 0x21ed, 0x3a42, 0x388d, 0x34e4, 0x33c3, 0x34f9, 0x3a7b, 0x33fb, 0x2cdd, 0x3b0e, 0x333b, 0x3973, 0x34fc, 0x3771, 0x32ea, 0x2de4, 0x31a8, 0x3946, 0x3657, 0x3a4e, 0x36f6, 0x2829, 0x3ba2, 0x3bdc, 0x3bb3, 0x306c, 0x398d, 0x3a1f, 0x3991, 0x3846, +0x3547, 0x3292, 0x2e85, 0x31ed, 0x3979, 0x3a90, 0x28a4, 0x3bed, 0x36d8, 0x340e, 0x3b6a, 0x3ab6, 0x3824, 0x382b, 0x3ac3, 0x3811, 0x36d7, 0x3519, 0x3a92, 0x3a42, 0x29d1, 0x383a, 0x3a9b, 0x300e, 0x2cd3, 0x39cd, 0x3874, 0x3a07, 0x2eb1, 0x3b86, 
0x3ad8, 0x3a5d, +0x3712, 0x284a, 0x38c1, 0x3bec, 0x39c0, 0x32cd, 0x3ad8, 0x3bce, 0x3817, 0x3896, 0x3aa7, 0x3870, 0x3996, 0x32cc, 0x3a4c, 0x3757, 0x3814, 0x3b65, 0x3acb, 0x376e, 0x34c0, 0x3609, 0x3bf0, 0x3b24, 0x3b29, 0x3848, 0x34b7, 0x398a, 0x220c, 0x3498, 0x3a8c, 0x3883, +0x38c4, 0x3af6, 0x3a42, 0x2dd6, 0x3147, 0x3717, 0x3a8e, 0x3af9, 0x3296, 0x38ef, 0x34fa, 0x3555, 0x3b29, 0x38de, 0x315e, 0x3773, 0x3b67, 0x3116, 0x38ec, 0x357c, 0x35d0, 0x2518, 0x3958, 0x2a03, 0x37d9, 0x3699, 0x3a1e, 0x3230, 0x3b13, 0x36d4, 0x3b2a, 0x39ad, +0x3b10, 0x351a, 0x3b97, 0x3326, 0x2b54, 0x3b7d, 0x386f, 0x373e, 0x37fa, 0x389b, 0x3b90, 0x3292, 0x3975, 0x38f3, 0x37f1, 0x3590, 0x3810, 0x2fd7, 0x3bf7, 0x3a5a, 0x3a1c, 0x34dd, 0x354c, 0x32f8, 0x3095, 0x321e, 0x39e0, 0x395c, 0x3717, 0x357f, 0x394a, 0x34b1, +0x3ba4, 0x380c, 0x3604, 0x2f50, 0x348d, 0x3828, 0x3a9f, 0x39ce, 0x32ca, 0x3906, 0x3ab2, 0x2ca5, 0x38c9, 0x362a, 0x34b2, 0x29dc, 0x3a36, 0x3052, 0x31b7, 0x3589, 0x387c, 0x3401, 0x3b22, 0x3ad6, 0x3ae8, 0x3238, 0x3494, 0x3502, 0x3717, 0x3a6c, 0x3229, 0x368c, +0x3056, 0x3a56, 0x3498, 0x39eb, 0x2864, 0x342d, 0x39e0, 0x34a1, 0x2b99, 0x3a04, 0x38ff, 0x328c, 0x34d9, 0x387d, 0x3a3c, 0x32e5, 0x39eb, 0x3984, 0x34dd, 0x38a7, 0x373f, 0x39b4, 0x3235, 0x2f58, 0x2f39, 0x3800, 0x3758, 0x3939, 0x39fc, 0x3a4b, 0x38bf, 0x30ee, +0x345e, 0x39c8, 0x3a6d, 0x3262, 0x3b81, 0x31dc, 0x3a15, 0x3bd0, 0x36af, 0x36de, 0x37d5, 0x39d7, 0x3ad3, 0x3ac1, 0x3109, 0x35ea, 0x31c6, 0x398d, 0x3987, 0x3a4a, 0x34d2, 0x2ed2, 0x35e6, 0x352c, 0x39eb, 0x3bd6, 0x3a5b, 0x39d1, 0x34aa, 0x3ade, 0x394b, 0x38a1, +0x2bed, 0x38de, 0x3811, 0x3813, 0x391a, 0x374b, 0x3829, 0x3725, 0x38f0, 0x3583, 0x3966, 0x3a7d, 0x375a, 0x38fe, 0x3696, 0x361c, 0x39a8, 0x35f0, 0x38e1, 0x3003, 0x3595, 0x316e, 0x3862, 0x3af8, 0x3af2, 0x34c8, 0x381d, 0x37d8, 0x3893, 0x3a9c, 0x3989, 0x308c, +0x30cc, 0x2538, 0x399d, 0x3919, 0x399e, 0x21cc, 0x38e9, 0x30f8, 0x3a20, 0x3b3c, 0x3990, 0x259c, 0x3143, 0x3080, 0x3967, 0x3afb, 0x3a1b, 0x3779, 0x2eeb, 0x39f3, 0x379a, 0x369c, 0x3985, 
0x3a1b, 0x3ba6, 0x3a53, 0x28d5, 0x3881, 0x31d9, 0x3a34, 0x3bd9, 0x393a, +0x3601, 0x2c6e, 0x3636, 0x3298, 0x39bb, 0x3a08, 0x38db, 0x35ad, 0x3a09, 0x36a6, 0x3bc7, 0x3bac, 0x34ae, 0x3291, 0x290b, 0x3250, 0x2648, 0x333d, 0x2bf3, 0x34b1, 0x30e0, 0x351f, 0x3a74, 0x38dc, 0x3883, 0x2841, 0x35e1, 0x390d, 0x3a50, 0x3abd, 0x386d, 0x3bb7, +0x3b94, 0x36b7, 0x3a49, 0x332f, 0x3a1d, 0x354b, 0x3bab, 0x3346, 0x3417, 0x351e, 0x3b6d, 0x391a, 0x2db3, 0x3b1c, 0x3a4a, 0x37b7, 0x36cf, 0x3a56, 0x39c4, 0x3be9, 0x34f0, 0x39be, 0x3691, 0x1ba5, 0x3888, 0x3040, 0x3ae1, 0x3b9b, 0x398f, 0x3a49, 0x3a16, 0x38c0, +0x386c, 0x39ab, 0x37fa, 0x382c, 0x3a6f, 0x393f, 0x340d, 0x38ef, 0x39d1, 0x3845, 0x398f, 0x363e, 0x3687, 0x3052, 0x3a2b, 0x392c, 0x2f5c, 0x3412, 0x3a1f, 0x3b2f, 0x3bcc, 0x3a63, 0x3a89, 0x36e9, 0x3921, 0x3b80, 0x2dc0, 0x3a03, 0x3beb, 0x38d3, 0x36cb, 0x39a3, +0x3978, 0x3a88, 0x3ba4, 0x3561, 0x28c5, 0x33a0, 0x37be, 0x2c39, 0x30ee, 0x3782, 0x2c07, 0x354e, 0x3491, 0x3a92, 0x331a, 0x3b15, 0x32e1, 0x3839, 0x3afb, 0x36c2, 0x2fd0, 0x29ad, 0x3b2e, 0x39c1, 0x2a8c, 0x341a, 0x2f90, 0x395a, 0x3969, 0x37ea, 0x3a5c, 0x3b6d, +0x3971, 0x3a93, 0x304e, 0x3623, 0x3a22, 0x31ee, 0x29df, 0x2c93, 0x3a01, 0x3a62, 0x366c, 0x371d, 0x3af3, 0x2e08, 0x3ac0, 0x3642, 0x3a28, 0x368d, 0x2d3d, 0x36d9, 0x32c3, 0x373f, 0x36fe, 0x3487, 0x2c81, 0x3623, 0x3b59, 0x3a91, 0x350a, 0x34f4, 0x3b09, 0x2c25, +0x3b13, 0x325a, 0x379e, 0x3a7d, 0x34b1, 0x39d5, 0x2ba8, 0x322b, 0x3b5e, 0x37ab, 0x2e24, 0x3ba9, 0x3a3d, 0x34f7, 0x3ba1, 0x3877, 0x3071, 0x39fb, 0x3bbd, 0x3633, 0x3b36, 0x2daa, 0x3b9b, 0x3aa0, 0x395c, 0x3b8f, 0x38d5, 0x3ab0, 0x3a8f, 0x36c2, 0x3b1f, 0x3489, +0x2acc, 0x3845, 0x3715, 0x37d8, 0x3992, 0x3bff, 0x350e, 0x3ad7, 0x39b0, 0x35ac, 0x3287, 0x385f, 0x3bd4, 0x37a3, 0x3438, 0x39a5, 0x3bcf, 0x38c3, 0x34f6, 0x3ae3, 0x3b57, 0x39af, 0x35eb, 0x3bed, 0x34d4, 0x2a95, 0x3b13, 0x384e, 0x3a3b, 0x33da, 0x3bce, 0x3b99, +0x3559, 0x3335, 0x3a2e, 0x3123, 0x38db, 0x33d0, 0x3638, 0x3b17, 0x3a72, 0x3afc, 0x3936, 0x3838, 0x2b69, 0x3895, 0x3a1a, 0x3192, 
0x39d5, 0x37a5, 0x2eb0, 0x2e8b, 0x329a, 0x3b90, 0x390a, 0x3a1e, 0x3847, 0x375d, 0x3873, 0x35e2, 0x3771, 0x30f5, 0x3231, 0x3bd7, +0x2bbc, 0x3ace, 0x31ad, 0x3a6b, 0x28a4, 0x3b48, 0x3ba3, 0x3a84, 0x3353, 0x39f6, 0x381f, 0x2dd6, 0x314c, 0x34af, 0x3929, 0x3921, 0x383b, 0x34b0, 0x3923, 0x32c9, 0x3ae7, 0x318f, 0x3480, 0x2ad8, 0x3042, 0x3a4c, 0x349d, 0x2c12, 0x3abb, 0x3a57, 0x3b0d, 0x3111, +0x3359, 0x3a84, 0x38f2, 0x368d, 0x2f4b, 0x3ba0, 0x395c, 0x3026, 0x3a15, 0x2a04, 0x326e, 0x3522, 0x31a2, 0x382f, 0x2ada, 0x3b7c, 0x2f80, 0x3af5, 0x2d35, 0x38fa, 0x39ab, 0x2c6d, 0x2e7a, 0x39f6, 0x31a4, 0x3a53, 0x358c, 0x3951, 0x3a4e, 0x3916, 0x2a3f, 0x3ae9, +0x3b03, 0x39f8, 0x39fe, 0x3a61, 0x39fb, 0x3704, 0x360d, 0x39a7, 0x37a9, 0x348f, 0x3a30, 0x3af5, 0x366f, 0x3b29, 0x3a6a, 0x33d5, 0x370a, 0x39cd, 0x3444, 0x3bea, 0x3b2b, 0x312e, 0x3b8e, 0x32cf, 0x3b79, 0x3302, 0x3bba, 0x3962, 0x3413, 0x37a1, 0x39e0, 0x3805 +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/y_input.h b/hwpe/redmule_softclear/inc/y_input.h new file mode 100644 index 0000000..45a2375 --- /dev/null +++ b/hwpe/redmule_softclear/inc/y_input.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t y_inp [768] = { +0x3150, 0x2dc1, 0x3033, 0x31f5, 0x3bb6, 0x3bff, 0x39f9, 0x3662, 0x3720, 0x351d, 0x384b, 0x3093, 0x3b9d, 0x35ad, 0x3695, 0x3466, 0x2300, 0x3445, 0x33ae, 0x3586, 0x38a3, 0x3bdb, 0x33a2, 0x379b, 0x3a0e, 0x38b0, 0x39ba, 0x379b, 0x39d3, 0x3a51, 0x3b30, 0x3794, +0x3b76, 0x3042, 0x38cc, 0x2dfc, 0x3b1a, 0x37fb, 0x38f7, 0x3824, 0x386f, 0x38c7, 0x36ee, 0x3a9c, 0x38d3, 0x2c67, 0x3a80, 0x2f30, 0x3328, 0x3721, 0x3790, 0x34e5, 0x3a6c, 0x3643, 0x3934, 0x3034, 0x38d4, 0x362e, 0x3b4b, 0x3408, 0x30c2, 0x370e, 0x3b31, 0x3b16, +0x3b6b, 0x39d4, 0x339c, 0x381e, 0x313e, 0x3671, 0x3ae2, 0x3479, 0x3940, 0x342d, 0x3925, 0x370a, 0x35d8, 0x2dad, 0x3888, 0x24b9, 0x375d, 0x34bd, 0x3243, 0x2ebb, 0x3970, 0x3a21, 0x3a07, 0x3877, 0x3888, 0x3569, 0x372d, 0x2ac1, 0x331e, 0x384d, 0x3996, 0x34a4, +0x35c1, 
0x33a9, 0x21ed, 0x3a42, 0x388d, 0x34e4, 0x33c3, 0x34f9, 0x3a7b, 0x33fb, 0x2cdd, 0x3b0e, 0x333b, 0x3973, 0x34fc, 0x3771, 0x32ea, 0x2de4, 0x31a8, 0x3946, 0x3657, 0x3a4e, 0x36f6, 0x2829, 0x3ba2, 0x3bdc, 0x3bb3, 0x306c, 0x398d, 0x3a1f, 0x3991, 0x3846, +0x3547, 0x3292, 0x2e85, 0x31ed, 0x3979, 0x3a90, 0x28a4, 0x3bed, 0x36d8, 0x340e, 0x3b6a, 0x3ab6, 0x3824, 0x382b, 0x3ac3, 0x3811, 0x36d7, 0x3519, 0x3a92, 0x3a42, 0x29d1, 0x383a, 0x3a9b, 0x300e, 0x2cd3, 0x39cd, 0x3874, 0x3a07, 0x2eb1, 0x3b86, 0x3ad8, 0x3a5d, +0x3712, 0x284a, 0x38c1, 0x3bec, 0x39c0, 0x32cd, 0x3ad8, 0x3bce, 0x3817, 0x3896, 0x3aa7, 0x3870, 0x3996, 0x32cc, 0x3a4c, 0x3757, 0x3814, 0x3b65, 0x3acb, 0x376e, 0x34c0, 0x3609, 0x3bf0, 0x3b24, 0x3b29, 0x3848, 0x34b7, 0x398a, 0x220c, 0x3498, 0x3a8c, 0x3883, +0x38c4, 0x3af6, 0x3a42, 0x2dd6, 0x3147, 0x3717, 0x3a8e, 0x3af9, 0x3296, 0x38ef, 0x34fa, 0x3555, 0x3b29, 0x38de, 0x315e, 0x3773, 0x3b67, 0x3116, 0x38ec, 0x357c, 0x35d0, 0x2518, 0x3958, 0x2a03, 0x37d9, 0x3699, 0x3a1e, 0x3230, 0x3b13, 0x36d4, 0x3b2a, 0x39ad, +0x3b10, 0x351a, 0x3b97, 0x3326, 0x2b54, 0x3b7d, 0x386f, 0x373e, 0x37fa, 0x389b, 0x3b90, 0x3292, 0x3975, 0x38f3, 0x37f1, 0x3590, 0x3810, 0x2fd7, 0x3bf7, 0x3a5a, 0x3a1c, 0x34dd, 0x354c, 0x32f8, 0x3095, 0x321e, 0x39e0, 0x395c, 0x3717, 0x357f, 0x394a, 0x34b1, +0x3ba4, 0x380c, 0x3604, 0x2f50, 0x348d, 0x3828, 0x3a9f, 0x39ce, 0x32ca, 0x3906, 0x3ab2, 0x2ca5, 0x38c9, 0x362a, 0x34b2, 0x29dc, 0x3a36, 0x3052, 0x31b7, 0x3589, 0x387c, 0x3401, 0x3b22, 0x3ad6, 0x3ae8, 0x3238, 0x3494, 0x3502, 0x3717, 0x3a6c, 0x3229, 0x368c, +0x3056, 0x3a56, 0x3498, 0x39eb, 0x2864, 0x342d, 0x39e0, 0x34a1, 0x2b99, 0x3a04, 0x38ff, 0x328c, 0x34d9, 0x387d, 0x3a3c, 0x32e5, 0x39eb, 0x3984, 0x34dd, 0x38a7, 0x373f, 0x39b4, 0x3235, 0x2f58, 0x2f39, 0x3800, 0x3758, 0x3939, 0x39fc, 0x3a4b, 0x38bf, 0x30ee, +0x345e, 0x39c8, 0x3a6d, 0x3262, 0x3b81, 0x31dc, 0x3a15, 0x3bd0, 0x36af, 0x36de, 0x37d5, 0x39d7, 0x3ad3, 0x3ac1, 0x3109, 0x35ea, 0x31c6, 0x398d, 0x3987, 0x3a4a, 0x34d2, 0x2ed2, 0x35e6, 0x352c, 0x39eb, 0x3bd6, 
0x3a5b, 0x39d1, 0x34aa, 0x3ade, 0x394b, 0x38a1, +0x2bed, 0x38de, 0x3811, 0x3813, 0x391a, 0x374b, 0x3829, 0x3725, 0x38f0, 0x3583, 0x3966, 0x3a7d, 0x375a, 0x38fe, 0x3696, 0x361c, 0x39a8, 0x35f0, 0x38e1, 0x3003, 0x3595, 0x316e, 0x3862, 0x3af8, 0x3af2, 0x34c8, 0x381d, 0x37d8, 0x3893, 0x3a9c, 0x3989, 0x308c, +0x30cc, 0x2538, 0x399d, 0x3919, 0x399e, 0x21cc, 0x38e9, 0x30f8, 0x3a20, 0x3b3c, 0x3990, 0x259c, 0x3143, 0x3080, 0x3967, 0x3afb, 0x3a1b, 0x3779, 0x2eeb, 0x39f3, 0x379a, 0x369c, 0x3985, 0x3a1b, 0x3ba6, 0x3a53, 0x28d5, 0x3881, 0x31d9, 0x3a34, 0x3bd9, 0x393a, +0x3601, 0x2c6e, 0x3636, 0x3298, 0x39bb, 0x3a08, 0x38db, 0x35ad, 0x3a09, 0x36a6, 0x3bc7, 0x3bac, 0x34ae, 0x3291, 0x290b, 0x3250, 0x2648, 0x333d, 0x2bf3, 0x34b1, 0x30e0, 0x351f, 0x3a74, 0x38dc, 0x3883, 0x2841, 0x35e1, 0x390d, 0x3a50, 0x3abd, 0x386d, 0x3bb7, +0x3b94, 0x36b7, 0x3a49, 0x332f, 0x3a1d, 0x354b, 0x3bab, 0x3346, 0x3417, 0x351e, 0x3b6d, 0x391a, 0x2db3, 0x3b1c, 0x3a4a, 0x37b7, 0x36cf, 0x3a56, 0x39c4, 0x3be9, 0x34f0, 0x39be, 0x3691, 0x1ba5, 0x3888, 0x3040, 0x3ae1, 0x3b9b, 0x398f, 0x3a49, 0x3a16, 0x38c0, +0x386c, 0x39ab, 0x37fa, 0x382c, 0x3a6f, 0x393f, 0x340d, 0x38ef, 0x39d1, 0x3845, 0x398f, 0x363e, 0x3687, 0x3052, 0x3a2b, 0x392c, 0x2f5c, 0x3412, 0x3a1f, 0x3b2f, 0x3bcc, 0x3a63, 0x3a89, 0x36e9, 0x3921, 0x3b80, 0x2dc0, 0x3a03, 0x3beb, 0x38d3, 0x36cb, 0x39a3, +0x3978, 0x3a88, 0x3ba4, 0x3561, 0x28c5, 0x33a0, 0x37be, 0x2c39, 0x30ee, 0x3782, 0x2c07, 0x354e, 0x3491, 0x3a92, 0x331a, 0x3b15, 0x32e1, 0x3839, 0x3afb, 0x36c2, 0x2fd0, 0x29ad, 0x3b2e, 0x39c1, 0x2a8c, 0x341a, 0x2f90, 0x395a, 0x3969, 0x37ea, 0x3a5c, 0x3b6d, +0x3971, 0x3a93, 0x304e, 0x3623, 0x3a22, 0x31ee, 0x29df, 0x2c93, 0x3a01, 0x3a62, 0x366c, 0x371d, 0x3af3, 0x2e08, 0x3ac0, 0x3642, 0x3a28, 0x368d, 0x2d3d, 0x36d9, 0x32c3, 0x373f, 0x36fe, 0x3487, 0x2c81, 0x3623, 0x3b59, 0x3a91, 0x350a, 0x34f4, 0x3b09, 0x2c25, +0x3b13, 0x325a, 0x379e, 0x3a7d, 0x34b1, 0x39d5, 0x2ba8, 0x322b, 0x3b5e, 0x37ab, 0x2e24, 0x3ba9, 0x3a3d, 0x34f7, 0x3ba1, 0x3877, 0x3071, 0x39fb, 0x3bbd, 
0x3633, 0x3b36, 0x2daa, 0x3b9b, 0x3aa0, 0x395c, 0x3b8f, 0x38d5, 0x3ab0, 0x3a8f, 0x36c2, 0x3b1f, 0x3489, +0x2acc, 0x3845, 0x3715, 0x37d8, 0x3992, 0x3bff, 0x350e, 0x3ad7, 0x39b0, 0x35ac, 0x3287, 0x385f, 0x3bd4, 0x37a3, 0x3438, 0x39a5, 0x3bcf, 0x38c3, 0x34f6, 0x3ae3, 0x3b57, 0x39af, 0x35eb, 0x3bed, 0x34d4, 0x2a95, 0x3b13, 0x384e, 0x3a3b, 0x33da, 0x3bce, 0x3b99, +0x3559, 0x3335, 0x3a2e, 0x3123, 0x38db, 0x33d0, 0x3638, 0x3b17, 0x3a72, 0x3afc, 0x3936, 0x3838, 0x2b69, 0x3895, 0x3a1a, 0x3192, 0x39d5, 0x37a5, 0x2eb0, 0x2e8b, 0x329a, 0x3b90, 0x390a, 0x3a1e, 0x3847, 0x375d, 0x3873, 0x35e2, 0x3771, 0x30f5, 0x3231, 0x3bd7, +0x2bbc, 0x3ace, 0x31ad, 0x3a6b, 0x28a4, 0x3b48, 0x3ba3, 0x3a84, 0x3353, 0x39f6, 0x381f, 0x2dd6, 0x314c, 0x34af, 0x3929, 0x3921, 0x383b, 0x34b0, 0x3923, 0x32c9, 0x3ae7, 0x318f, 0x3480, 0x2ad8, 0x3042, 0x3a4c, 0x349d, 0x2c12, 0x3abb, 0x3a57, 0x3b0d, 0x3111, +0x3359, 0x3a84, 0x38f2, 0x368d, 0x2f4b, 0x3ba0, 0x395c, 0x3026, 0x3a15, 0x2a04, 0x326e, 0x3522, 0x31a2, 0x382f, 0x2ada, 0x3b7c, 0x2f80, 0x3af5, 0x2d35, 0x38fa, 0x39ab, 0x2c6d, 0x2e7a, 0x39f6, 0x31a4, 0x3a53, 0x358c, 0x3951, 0x3a4e, 0x3916, 0x2a3f, 0x3ae9, +0x3b03, 0x39f8, 0x39fe, 0x3a61, 0x39fb, 0x3704, 0x360d, 0x39a7, 0x37a9, 0x348f, 0x3a30, 0x3af5, 0x366f, 0x3b29, 0x3a6a, 0x33d5, 0x370a, 0x39cd, 0x3444, 0x3bea, 0x3b2b, 0x312e, 0x3b8e, 0x32cf, 0x3b79, 0x3302, 0x3bba, 0x3962, 0x3413, 0x37a1, 0x39e0, 0x3805 +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/z_2D.h b/hwpe/redmule_softclear/inc/z_2D.h new file mode 100644 index 0000000..aff808a --- /dev/null +++ b/hwpe/redmule_softclear/inc/z_2D.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t z_oup_2D [24][32] = { +0x4845, 0x4897, 0x4608, 0x4838, 0x4855, 0x487b, 0x4869, 0x4880, 0x46d1, 0x48b0, 0x48db, 0x483f, 0x48c9, 0x485f, 0x4881, 0x483a, 0x484b, 0x472c, 0x4762, 0x492b, 0x4822, 0x48fd, 0x488e, 0x492e, 0x483e, 0x484f, 0x49e8, 0x46d7, 0x484b, 0x489d, 0x490b, 0x47e9, +0x484f, 0x47d2, 0x44be, 0x4747, 
0x47c7, 0x46c0, 0x4727, 0x48af, 0x46c5, 0x482d, 0x483d, 0x482e, 0x4897, 0x479f, 0x488b, 0x4749, 0x489a, 0x46a8, 0x46f2, 0x488b, 0x4891, 0x47e8, 0x4872, 0x483d, 0x4716, 0x46fd, 0x49b5, 0x46a0, 0x46e7, 0x47a4, 0x48a1, 0x4767, +0x4939, 0x4935, 0x4703, 0x48c1, 0x4863, 0x48bd, 0x4913, 0x48cf, 0x48b6, 0x48b8, 0x4946, 0x4920, 0x495e, 0x48e1, 0x4938, 0x48b2, 0x493a, 0x4882, 0x483b, 0x49d5, 0x4911, 0x4972, 0x496b, 0x49df, 0x48f2, 0x4888, 0x4a46, 0x4821, 0x48c1, 0x490c, 0x49b2, 0x48a3, +0x463a, 0x47b0, 0x44cb, 0x4762, 0x4765, 0x46b9, 0x466a, 0x4814, 0x4631, 0x4796, 0x4666, 0x474b, 0x4798, 0x4704, 0x4838, 0x4761, 0x47d3, 0x4590, 0x45ea, 0x48a2, 0x47f1, 0x4844, 0x484b, 0x4776, 0x47d6, 0x46d8, 0x48f3, 0x44d3, 0x46fa, 0x478d, 0x481e, 0x466e, +0x4827, 0x481e, 0x45a2, 0x4794, 0x4727, 0x4806, 0x475d, 0x48d5, 0x4708, 0x4828, 0x4862, 0x480d, 0x4895, 0x4832, 0x48bd, 0x47f1, 0x482a, 0x46a7, 0x47b1, 0x492d, 0x484d, 0x4884, 0x48dc, 0x485f, 0x476d, 0x480c, 0x48e9, 0x46d3, 0x4728, 0x4884, 0x48a0, 0x480e, +0x4862, 0x4813, 0x4675, 0x485a, 0x47e8, 0x4738, 0x4836, 0x4823, 0x46e7, 0x4821, 0x4822, 0x47b3, 0x4846, 0x4855, 0x4863, 0x4717, 0x4872, 0x47c1, 0x46d5, 0x488e, 0x47e2, 0x485f, 0x487c, 0x48b8, 0x481e, 0x4788, 0x48bd, 0x4677, 0x46c9, 0x47f8, 0x48fe, 0x47fc, +0x47a0, 0x47b2, 0x4588, 0x467e, 0x4662, 0x46c7, 0x46e8, 0x4812, 0x4536, 0x474e, 0x46c0, 0x468f, 0x481f, 0x4679, 0x46a1, 0x46e2, 0x4809, 0x4560, 0x4630, 0x47eb, 0x46b5, 0x4757, 0x4848, 0x477f, 0x46a6, 0x46d8, 0x4870, 0x459a, 0x4670, 0x4678, 0x47d2, 0x468c, +0x4762, 0x48c4, 0x46e3, 0x4791, 0x46b1, 0x486d, 0x47d0, 0x4867, 0x468d, 0x47f6, 0x48a5, 0x4756, 0x4857, 0x4854, 0x4866, 0x4838, 0x484d, 0x46ec, 0x47d2, 0x48f6, 0x484a, 0x4879, 0x4848, 0x483c, 0x471d, 0x4806, 0x48fa, 0x4730, 0x4768, 0x47b8, 0x4865, 0x46f9, +0x48a8, 0x4918, 0x46ca, 0x4867, 0x4800, 0x4862, 0x48d3, 0x4910, 0x474e, 0x4849, 0x48eb, 0x486b, 0x4966, 0x48c5, 0x48f4, 0x4830, 0x48f9, 0x4778, 0x481e, 0x499e, 0x48cf, 0x48f1, 0x4982, 0x4923, 0x487c, 0x47cf, 0x49ea, 0x4649, 0x4773, 
0x495e, 0x48b2, 0x483f, +0x48a7, 0x4975, 0x4616, 0x481e, 0x481f, 0x4866, 0x48b6, 0x4864, 0x47dc, 0x4873, 0x485c, 0x487f, 0x4938, 0x491f, 0x490d, 0x48b6, 0x48f8, 0x48a1, 0x4859, 0x492d, 0x489c, 0x4915, 0x4899, 0x4887, 0x486c, 0x4859, 0x49ca, 0x471e, 0x4867, 0x4918, 0x48d3, 0x4827, +0x488b, 0x4998, 0x4704, 0x481d, 0x48b8, 0x4880, 0x4876, 0x4944, 0x470c, 0x48f2, 0x48b9, 0x489b, 0x4956, 0x48e5, 0x48d6, 0x48a5, 0x48dc, 0x4856, 0x484e, 0x49ab, 0x48e0, 0x490e, 0x48dd, 0x4945, 0x488b, 0x48dd, 0x4a32, 0x47ea, 0x4835, 0x4911, 0x4965, 0x4819, +0x460e, 0x481e, 0x452c, 0x4673, 0x475c, 0x4717, 0x46f6, 0x46d0, 0x4696, 0x46bc, 0x4726, 0x481e, 0x4763, 0x46ea, 0x46fe, 0x4758, 0x478b, 0x4627, 0x4704, 0x483f, 0x46ad, 0x47b1, 0x4792, 0x4816, 0x46f2, 0x4684, 0x4827, 0x45a8, 0x472f, 0x47a4, 0x4797, 0x462b, +0x483f, 0x48ab, 0x468f, 0x4863, 0x485a, 0x4766, 0x481d, 0x48cb, 0x47dc, 0x4903, 0x48fc, 0x4830, 0x48cc, 0x483e, 0x48ab, 0x4864, 0x4966, 0x4763, 0x4794, 0x499d, 0x488e, 0x488b, 0x48dc, 0x4960, 0x4854, 0x484c, 0x499c, 0x474c, 0x4826, 0x48bc, 0x4949, 0x4883, +0x489d, 0x4905, 0x4718, 0x481e, 0x48e3, 0x48f4, 0x48c1, 0x4904, 0x47e8, 0x48b3, 0x4892, 0x48d4, 0x48ff, 0x4894, 0x48d5, 0x4886, 0x48fa, 0x4803, 0x47d2, 0x492e, 0x4870, 0x48b2, 0x48e5, 0x492b, 0x487b, 0x4785, 0x49e3, 0x471d, 0x4837, 0x48bf, 0x489b, 0x48c4, +0x475c, 0x4871, 0x464a, 0x4811, 0x47af, 0x471c, 0x4817, 0x4817, 0x463b, 0x484e, 0x477f, 0x464f, 0x4704, 0x487c, 0x47a3, 0x4725, 0x4853, 0x462a, 0x465a, 0x4860, 0x4736, 0x4880, 0x47e1, 0x482b, 0x4811, 0x46c0, 0x48dc, 0x475d, 0x4668, 0x4806, 0x4893, 0x46f4, +0x4858, 0x4959, 0x463d, 0x487b, 0x480f, 0x484e, 0x48c0, 0x48a6, 0x4847, 0x4894, 0x48a0, 0x484a, 0x491e, 0x48f4, 0x48fc, 0x48b5, 0x48ce, 0x47d2, 0x47db, 0x497f, 0x4955, 0x4939, 0x48a7, 0x48ce, 0x4890, 0x4884, 0x49d6, 0x4763, 0x486e, 0x4922, 0x48f4, 0x48c3, +0x47ec, 0x491c, 0x4698, 0x4783, 0x4715, 0x4754, 0x4745, 0x4752, 0x472f, 0x4832, 0x4817, 0x4809, 0x47f8, 0x48c3, 0x47e6, 0x4800, 0x48b6, 0x4730, 0x480a, 0x48cb, 0x479e, 0x488e, 
0x47c2, 0x488e, 0x472f, 0x47ee, 0x489d, 0x4744, 0x4755, 0x4851, 0x4846, 0x47d3, +0x4838, 0x48a0, 0x4634, 0x4762, 0x4786, 0x4806, 0x47e3, 0x482d, 0x4726, 0x486c, 0x47b7, 0x4803, 0x48ac, 0x4814, 0x48e0, 0x4839, 0x4827, 0x4750, 0x46f2, 0x48c5, 0x483f, 0x4886, 0x48ad, 0x4856, 0x47e8, 0x47a9, 0x4937, 0x4743, 0x46d0, 0x481f, 0x484c, 0x4804, +0x47fd, 0x481f, 0x456d, 0x4813, 0x474d, 0x4807, 0x4688, 0x480e, 0x46e8, 0x4810, 0x469f, 0x4799, 0x4853, 0x478f, 0x47f2, 0x4824, 0x47d0, 0x471f, 0x46da, 0x485f, 0x4813, 0x481c, 0x482e, 0x4863, 0x4786, 0x480b, 0x48c9, 0x46b8, 0x475a, 0x46e2, 0x4852, 0x46c5, +0x45af, 0x4802, 0x4466, 0x46c2, 0x465d, 0x4743, 0x46b7, 0x47ba, 0x4636, 0x46c3, 0x4677, 0x4784, 0x485a, 0x47c2, 0x46dc, 0x46ac, 0x47de, 0x460e, 0x465f, 0x4834, 0x47f4, 0x4769, 0x46fc, 0x4810, 0x45fd, 0x45ea, 0x48d0, 0x45b5, 0x4704, 0x4783, 0x4830, 0x46c4, +0x4759, 0x47c7, 0x453d, 0x45b0, 0x4741, 0x4702, 0x4736, 0x4793, 0x461b, 0x47ba, 0x470b, 0x46dd, 0x4657, 0x470b, 0x470d, 0x4710, 0x486c, 0x468f, 0x45c3, 0x46ba, 0x479d, 0x483b, 0x46c9, 0x4774, 0x46a9, 0x46a7, 0x4833, 0x4606, 0x4690, 0x46a9, 0x46f5, 0x46a7, +0x47ac, 0x48bb, 0x452c, 0x4803, 0x470f, 0x4824, 0x47d5, 0x48cb, 0x4707, 0x484a, 0x4832, 0x4797, 0x4851, 0x482c, 0x487a, 0x4877, 0x4891, 0x465d, 0x47f4, 0x48ce, 0x4898, 0x4899, 0x484e, 0x486a, 0x47ac, 0x47f0, 0x493e, 0x4611, 0x47e2, 0x489e, 0x488c, 0x46af, +0x4665, 0x4836, 0x45e4, 0x46b6, 0x46a1, 0x46b9, 0x46c8, 0x46dd, 0x4658, 0x474b, 0x467b, 0x4777, 0x4769, 0x4798, 0x4785, 0x475e, 0x472a, 0x4656, 0x45fb, 0x4881, 0x46fc, 0x472d, 0x476e, 0x47a3, 0x465d, 0x46ca, 0x4855, 0x4500, 0x464f, 0x479a, 0x46c3, 0x4738, +0x481e, 0x486c, 0x4659, 0x4801, 0x4756, 0x477a, 0x47d5, 0x487b, 0x4706, 0x4808, 0x484f, 0x4838, 0x4870, 0x4863, 0x48d3, 0x4806, 0x4865, 0x4771, 0x46be, 0x494c, 0x4915, 0x484c, 0x4900, 0x4862, 0x481a, 0x46e8, 0x4974, 0x46a0, 0x4775, 0x483d, 0x487c, 0x480e +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/inc/z_output.h 
b/hwpe/redmule_softclear/inc/z_output.h new file mode 100644 index 0000000..96c7e5f --- /dev/null +++ b/hwpe/redmule_softclear/inc/z_output.h @@ -0,0 +1,27 @@ + /* Header file generated by RedMulE Golden Model */ +uint16_t z_oup [768] = { +0x4845, 0x4897, 0x4608, 0x4838, 0x4855, 0x487b, 0x4869, 0x4880, 0x46d1, 0x48b0, 0x48db, 0x483f, 0x48c9, 0x485f, 0x4881, 0x483a, 0x484b, 0x472c, 0x4762, 0x492b, 0x4822, 0x48fd, 0x488e, 0x492e, 0x483e, 0x484f, 0x49e8, 0x46d7, 0x484b, 0x489d, 0x490b, 0x47e9, +0x484f, 0x47d2, 0x44be, 0x4747, 0x47c7, 0x46c0, 0x4727, 0x48af, 0x46c5, 0x482d, 0x483d, 0x482e, 0x4897, 0x479f, 0x488b, 0x4749, 0x489a, 0x46a8, 0x46f2, 0x488b, 0x4891, 0x47e8, 0x4872, 0x483d, 0x4716, 0x46fd, 0x49b5, 0x46a0, 0x46e7, 0x47a4, 0x48a1, 0x4767, +0x4939, 0x4935, 0x4703, 0x48c1, 0x4863, 0x48bd, 0x4913, 0x48cf, 0x48b6, 0x48b8, 0x4946, 0x4920, 0x495e, 0x48e1, 0x4938, 0x48b2, 0x493a, 0x4882, 0x483b, 0x49d5, 0x4911, 0x4972, 0x496b, 0x49df, 0x48f2, 0x4888, 0x4a46, 0x4821, 0x48c1, 0x490c, 0x49b2, 0x48a3, +0x463a, 0x47b0, 0x44cb, 0x4762, 0x4765, 0x46b9, 0x466a, 0x4814, 0x4631, 0x4796, 0x4666, 0x474b, 0x4798, 0x4704, 0x4838, 0x4761, 0x47d3, 0x4590, 0x45ea, 0x48a2, 0x47f1, 0x4844, 0x484b, 0x4776, 0x47d6, 0x46d8, 0x48f3, 0x44d3, 0x46fa, 0x478d, 0x481e, 0x466e, +0x4827, 0x481e, 0x45a2, 0x4794, 0x4727, 0x4806, 0x475d, 0x48d5, 0x4708, 0x4828, 0x4862, 0x480d, 0x4895, 0x4832, 0x48bd, 0x47f1, 0x482a, 0x46a7, 0x47b1, 0x492d, 0x484d, 0x4884, 0x48dc, 0x485f, 0x476d, 0x480c, 0x48e9, 0x46d3, 0x4728, 0x4884, 0x48a0, 0x480e, +0x4862, 0x4813, 0x4675, 0x485a, 0x47e8, 0x4738, 0x4836, 0x4823, 0x46e7, 0x4821, 0x4822, 0x47b3, 0x4846, 0x4855, 0x4863, 0x4717, 0x4872, 0x47c1, 0x46d5, 0x488e, 0x47e2, 0x485f, 0x487c, 0x48b8, 0x481e, 0x4788, 0x48bd, 0x4677, 0x46c9, 0x47f8, 0x48fe, 0x47fc, +0x47a0, 0x47b2, 0x4588, 0x467e, 0x4662, 0x46c7, 0x46e8, 0x4812, 0x4536, 0x474e, 0x46c0, 0x468f, 0x481f, 0x4679, 0x46a1, 0x46e2, 0x4809, 0x4560, 0x4630, 0x47eb, 0x46b5, 0x4757, 0x4848, 0x477f, 0x46a6, 0x46d8, 0x4870, 
0x459a, 0x4670, 0x4678, 0x47d2, 0x468c, +0x4762, 0x48c4, 0x46e3, 0x4791, 0x46b1, 0x486d, 0x47d0, 0x4867, 0x468d, 0x47f6, 0x48a5, 0x4756, 0x4857, 0x4854, 0x4866, 0x4838, 0x484d, 0x46ec, 0x47d2, 0x48f6, 0x484a, 0x4879, 0x4848, 0x483c, 0x471d, 0x4806, 0x48fa, 0x4730, 0x4768, 0x47b8, 0x4865, 0x46f9, +0x48a8, 0x4918, 0x46ca, 0x4867, 0x4800, 0x4862, 0x48d3, 0x4910, 0x474e, 0x4849, 0x48eb, 0x486b, 0x4966, 0x48c5, 0x48f4, 0x4830, 0x48f9, 0x4778, 0x481e, 0x499e, 0x48cf, 0x48f1, 0x4982, 0x4923, 0x487c, 0x47cf, 0x49ea, 0x4649, 0x4773, 0x495e, 0x48b2, 0x483f, +0x48a7, 0x4975, 0x4616, 0x481e, 0x481f, 0x4866, 0x48b6, 0x4864, 0x47dc, 0x4873, 0x485c, 0x487f, 0x4938, 0x491f, 0x490d, 0x48b6, 0x48f8, 0x48a1, 0x4859, 0x492d, 0x489c, 0x4915, 0x4899, 0x4887, 0x486c, 0x4859, 0x49ca, 0x471e, 0x4867, 0x4918, 0x48d3, 0x4827, +0x488b, 0x4998, 0x4704, 0x481d, 0x48b8, 0x4880, 0x4876, 0x4944, 0x470c, 0x48f2, 0x48b9, 0x489b, 0x4956, 0x48e5, 0x48d6, 0x48a5, 0x48dc, 0x4856, 0x484e, 0x49ab, 0x48e0, 0x490e, 0x48dd, 0x4945, 0x488b, 0x48dd, 0x4a32, 0x47ea, 0x4835, 0x4911, 0x4965, 0x4819, +0x460e, 0x481e, 0x452c, 0x4673, 0x475c, 0x4717, 0x46f6, 0x46d0, 0x4696, 0x46bc, 0x4726, 0x481e, 0x4763, 0x46ea, 0x46fe, 0x4758, 0x478b, 0x4627, 0x4704, 0x483f, 0x46ad, 0x47b1, 0x4792, 0x4816, 0x46f2, 0x4684, 0x4827, 0x45a8, 0x472f, 0x47a4, 0x4797, 0x462b, +0x483f, 0x48ab, 0x468f, 0x4863, 0x485a, 0x4766, 0x481d, 0x48cb, 0x47dc, 0x4903, 0x48fc, 0x4830, 0x48cc, 0x483e, 0x48ab, 0x4864, 0x4966, 0x4763, 0x4794, 0x499d, 0x488e, 0x488b, 0x48dc, 0x4960, 0x4854, 0x484c, 0x499c, 0x474c, 0x4826, 0x48bc, 0x4949, 0x4883, +0x489d, 0x4905, 0x4718, 0x481e, 0x48e3, 0x48f4, 0x48c1, 0x4904, 0x47e8, 0x48b3, 0x4892, 0x48d4, 0x48ff, 0x4894, 0x48d5, 0x4886, 0x48fa, 0x4803, 0x47d2, 0x492e, 0x4870, 0x48b2, 0x48e5, 0x492b, 0x487b, 0x4785, 0x49e3, 0x471d, 0x4837, 0x48bf, 0x489b, 0x48c4, +0x475c, 0x4871, 0x464a, 0x4811, 0x47af, 0x471c, 0x4817, 0x4817, 0x463b, 0x484e, 0x477f, 0x464f, 0x4704, 0x487c, 0x47a3, 0x4725, 0x4853, 0x462a, 0x465a, 0x4860, 
0x4736, 0x4880, 0x47e1, 0x482b, 0x4811, 0x46c0, 0x48dc, 0x475d, 0x4668, 0x4806, 0x4893, 0x46f4, +0x4858, 0x4959, 0x463d, 0x487b, 0x480f, 0x484e, 0x48c0, 0x48a6, 0x4847, 0x4894, 0x48a0, 0x484a, 0x491e, 0x48f4, 0x48fc, 0x48b5, 0x48ce, 0x47d2, 0x47db, 0x497f, 0x4955, 0x4939, 0x48a7, 0x48ce, 0x4890, 0x4884, 0x49d6, 0x4763, 0x486e, 0x4922, 0x48f4, 0x48c3, +0x47ec, 0x491c, 0x4698, 0x4783, 0x4715, 0x4754, 0x4745, 0x4752, 0x472f, 0x4832, 0x4817, 0x4809, 0x47f8, 0x48c3, 0x47e6, 0x4800, 0x48b6, 0x4730, 0x480a, 0x48cb, 0x479e, 0x488e, 0x47c2, 0x488e, 0x472f, 0x47ee, 0x489d, 0x4744, 0x4755, 0x4851, 0x4846, 0x47d3, +0x4838, 0x48a0, 0x4634, 0x4762, 0x4786, 0x4806, 0x47e3, 0x482d, 0x4726, 0x486c, 0x47b7, 0x4803, 0x48ac, 0x4814, 0x48e0, 0x4839, 0x4827, 0x4750, 0x46f2, 0x48c5, 0x483f, 0x4886, 0x48ad, 0x4856, 0x47e8, 0x47a9, 0x4937, 0x4743, 0x46d0, 0x481f, 0x484c, 0x4804, +0x47fd, 0x481f, 0x456d, 0x4813, 0x474d, 0x4807, 0x4688, 0x480e, 0x46e8, 0x4810, 0x469f, 0x4799, 0x4853, 0x478f, 0x47f2, 0x4824, 0x47d0, 0x471f, 0x46da, 0x485f, 0x4813, 0x481c, 0x482e, 0x4863, 0x4786, 0x480b, 0x48c9, 0x46b8, 0x475a, 0x46e2, 0x4852, 0x46c5, +0x45af, 0x4802, 0x4466, 0x46c2, 0x465d, 0x4743, 0x46b7, 0x47ba, 0x4636, 0x46c3, 0x4677, 0x4784, 0x485a, 0x47c2, 0x46dc, 0x46ac, 0x47de, 0x460e, 0x465f, 0x4834, 0x47f4, 0x4769, 0x46fc, 0x4810, 0x45fd, 0x45ea, 0x48d0, 0x45b5, 0x4704, 0x4783, 0x4830, 0x46c4, +0x4759, 0x47c7, 0x453d, 0x45b0, 0x4741, 0x4702, 0x4736, 0x4793, 0x461b, 0x47ba, 0x470b, 0x46dd, 0x4657, 0x470b, 0x470d, 0x4710, 0x486c, 0x468f, 0x45c3, 0x46ba, 0x479d, 0x483b, 0x46c9, 0x4774, 0x46a9, 0x46a7, 0x4833, 0x4606, 0x4690, 0x46a9, 0x46f5, 0x46a7, +0x47ac, 0x48bb, 0x452c, 0x4803, 0x470f, 0x4824, 0x47d5, 0x48cb, 0x4707, 0x484a, 0x4832, 0x4797, 0x4851, 0x482c, 0x487a, 0x4877, 0x4891, 0x465d, 0x47f4, 0x48ce, 0x4898, 0x4899, 0x484e, 0x486a, 0x47ac, 0x47f0, 0x493e, 0x4611, 0x47e2, 0x489e, 0x488c, 0x46af, +0x4665, 0x4836, 0x45e4, 0x46b6, 0x46a1, 0x46b9, 0x46c8, 0x46dd, 0x4658, 0x474b, 0x467b, 0x4777, 0x4769, 
0x4798, 0x4785, 0x475e, 0x472a, 0x4656, 0x45fb, 0x4881, 0x46fc, 0x472d, 0x476e, 0x47a3, 0x465d, 0x46ca, 0x4855, 0x4500, 0x464f, 0x479a, 0x46c3, 0x4738, +0x481e, 0x486c, 0x4659, 0x4801, 0x4756, 0x477a, 0x47d5, 0x487b, 0x4706, 0x4808, 0x484f, 0x4838, 0x4870, 0x4863, 0x48d3, 0x4806, 0x4865, 0x4771, 0x46be, 0x494c, 0x4915, 0x484c, 0x4900, 0x4862, 0x481a, 0x46e8, 0x4974, 0x46a0, 0x4775, 0x483d, 0x487c, 0x480e +}; \ No newline at end of file diff --git a/hwpe/redmule_softclear/pulp_inject_fault.tcl b/hwpe/redmule_softclear/pulp_inject_fault.tcl new file mode 100644 index 0000000..61ccadf --- /dev/null +++ b/hwpe/redmule_softclear/pulp_inject_fault.tcl @@ -0,0 +1,53 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Author: Michael Rogenmoser (michaero@iis.ee.ethz.ch) + +transcript quietly +if {! [info exists ::env(VSIM_PATH)]} {error "Define VSIM_PATH"} +set utils_base_path [file join $::env(VSIM_PATH) scripts fault_injection_utils] +set script_base_path [file join $::env(VSIM_PATH) fault_injection_sim scripts] + +set verbosity 2 +set log_injections 1 +# Easy way to generate a variable seed +# set seed [clock seconds] +# Default value +set seed 12345 +set print_statistics 1 + +set inject_start_time 550000000000ps +set inject_stop_time 750000000000ps +set injection_clock "pulp_cluster_tb/cluster_i/clk_i" +set injection_clock_trigger 0 +set fault_period 150 +set rand_initial_injection_phase 0 +# max_num set to 0 means until stop_time +set max_num_fault_inject 0 +set signal_fault_duration 20ns +set register_fault_duration 0ns + +set allow_multi_bit_upset $::env(MULTI_BIT_UPSET) +set use_bitwidth_as_weight 0 +set check_core_output_modification 0 +set check_core_next_state_modification 0 +set reg_to_sig_ratio 1 + +source [file join $utils_base_path pulp_extract_nets.tcl] + +set inject_signals_netlist [] +set inject_register_netlist [] +set output_netlist [] 
+set next_state_netlist [] +set assertion_disable_list [] + +# for {set idx 0} {$idx < 12} {incr idx} { +# set inject_signals_netlist [list {*}$inject_signals_netlist {*}[get_all_core_nets $idx]] +# set output_netlist [list {*}$output_netlist {*}[get_core_output_nets $idx]] +# } + +set inject_register_netlist [list {*}$inject_register_netlist {*}[get_memory_slice {0 16} {256 336}]] + +source [file join $script_base_path inject_fault.tcl] + diff --git a/hwpe/redmule_softclear/redmule.c b/hwpe/redmule_softclear/redmule.c new file mode 100644 index 0000000..a80171c --- /dev/null +++ b/hwpe/redmule_softclear/redmule.c @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2022-2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Yvan Tortorella + * + * RedMulE SW test + */ + +#include +#include "stdio.h" +#include "archi_redmule.h" +#include "hal_redmule.h" +#include "pulp.h" + +static inline void wait_cycles(const unsigned cycles) +{ + /** + * Busy-wait for roughly `cycles` clock cycles. Each iteration of the loop below will take four cycles on RI5CY (one for + * `addi` and three for the taken `bnez`; if the instructions hit in the + * I$). Thus, we let `i` count the number of remaining loop iterations and + * initialize it to a fourth of the number of clock cycles.
With this + * initialization, we must not enter the loop if the number of clock cycles + * is less than four, because this will cause an underflow on the first + * subtraction. NOTE(review): the guard below is `ble` against 4, so a request of exactly four cycles also skips the loop, and `ble` compares as signed, so a value with the top bit set would skip it too -- confirm both are intended. + */ + register unsigned threshold; + asm volatile("li %[threshold], 4" : [threshold] "=r" (threshold)); + asm volatile goto("ble %[cycles], %[threshold], %l2" + : /* no output */ + : [cycles] "r" (cycles), [threshold] "r" (threshold) + : /* no clobbers */ + : __wait_cycles_end); + register unsigned i = cycles >> 2; +__wait_cycles_start: + // Decrement `i` and loop if it is not yet zero. + asm volatile("addi %0, %0, -1" : "+r" (i)); + asm volatile goto("bnez %0, %l1" + : /* no output */ + : "r" (i) + : /* no clobbers */ + : __wait_cycles_start); +__wait_cycles_end: + return; +} + +int main() { + + volatile int errors = 0; + unsigned int cluster_id = rt_cluster_id(); + unsigned int intc_data_correctable_cnt, redmule_data_correctable_cnt = 0; + unsigned int intc_meta_correctable_cnt = 0; + unsigned int intc_data_uncorrectable_cnt, redmule_data_uncorrectable_cnt = 0; + unsigned int intc_meta_uncorrectable_cnt = 0; + + if(get_core_id() == 0){ + + uint16_t m_size = M_SIZE; + uint16_t n_size = N_SIZE; + uint16_t k_size = K_SIZE; + + uint8_t *x_ext = x_inp; + uint8_t *w_ext = w_inp; + uint8_t *y_ext = y_inp; + uint8_t *z_ext = z_oup; + + uint8_t volatile *x = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*n_size)); + uint8_t volatile *w = (uint8_t volatile *) pi_l1_malloc(0, (2*n_size*k_size)); + uint8_t volatile *y = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*k_size)); + uint8_t volatile *z = (uint8_t volatile *) pi_l1_malloc(0, (2*m_size*k_size)); + + #ifdef USE_DMA + volatile unsigned int dma_id = 0; + dma_id = mchan_alloc(); + mchan_transfer((unsigned int) 2*(2*m_size*n_size), + (unsigned int) x_ext, + (unsigned int) x ); + mchan_barrier(dma_id); + mchan_free(dma_id); + + dma_id = mchan_alloc(); + mchan_transfer((unsigned int) 2*(2*n_size*k_size), + (unsigned int) w_ext, + (unsigned int) w
); + mchan_barrier(dma_id); + mchan_free(dma_id); + + dma_id = mchan_alloc(); + mchan_transfer((unsigned int) 2*(2*m_size*k_size), + (unsigned int) y_ext, + (unsigned int) y ); + mchan_barrier(dma_id); + #else + generate_test_data16((int) x, (int) w, (int) y, (int) m_size, (int) n_size, (int) k_size); + #endif + + int gold_sum = 0, check_sum = 0; + int i,j; + + int offload_id_tmp, offload_id; + + // Enable RedMulE + hwpe_cg_enable(); + asm volatile("": : :"memory"); + + hwpe_soft_clear(); + asm volatile("": : :"memory"); + + volatile int job_id = -1; + + // job 0 + job_id = hwpe_wait_acquire(); + asm volatile("": : :"memory"); + redmule_x_add_set ((unsigned int) x); + redmule_w_add_set ((unsigned int) w); + redmule_y_add_set ((unsigned int) y); + redmule_z_add_set ((unsigned int) z); + redmule_cfg (m_size, n_size, k_size, gemm_ops); + asm volatile("": : :"memory"); + hwpe_trigger_job(); + asm volatile("": : :"memory"); + + // job 1 + job_id = hwpe_wait_acquire(); + asm volatile("": : :"memory"); + redmule_x_add_set ((unsigned int) x); + redmule_w_add_set ((unsigned int) w); + redmule_y_add_set ((unsigned int) y); + redmule_z_add_set ((unsigned int) z); + redmule_cfg (m_size, n_size, k_size, gemm_ops); + asm volatile("": : :"memory"); + hwpe_trigger_job(); + asm volatile("": : :"memory"); + + // soft-clear execution + wait_cycles(20); + hwpe_soft_clear(); + wait_cycles(100); + + // job 0 + job_id = hwpe_wait_acquire(); + asm volatile("": : :"memory"); + redmule_x_add_set ((unsigned int) x); + redmule_w_add_set ((unsigned int) w); + redmule_y_add_set ((unsigned int) y); + redmule_z_add_set ((unsigned int) z); + redmule_cfg (m_size, n_size, k_size, gemm_ops); + asm volatile("": : :"memory"); + hwpe_trigger_job(); + asm volatile("": : :"memory"); + + // Wait for end of computation + redmule_evt_wait(); + + // Disable RedMulE + hwpe_cg_disable(); + + errors = redmule_compare16((int) z, (int) m_size, (int) k_size); + + *(int *) 0x1A1040A0 = errors; + + if(job_id != 0) { 
+ printf ("Terminated test with wrong job id!!! and %d errors. See you!\n", errors); + errors++; + } + else { + printf ("Terminated test with OK job id and %d errors. See you!\n", errors); + } + + } + synch_barrier(); + return (errors != 0); +} diff --git a/hwpe/softex/Makefile b/hwpe/softex/Makefile new file mode 100644 index 0000000..a15e398 --- /dev/null +++ b/hwpe/softex/Makefile @@ -0,0 +1,9 @@ +PULP_APP = test +PULP_APP_SRCS = softex.c +PULP_CFLAGS = -O3 + +ifeq ($(use_dma),1) + PULP_CFLAGS += -DUSE_DMA +endif + +include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk diff --git a/hwpe/softex/archi_softex.h b/hwpe/softex/archi_softex.h new file mode 100644 index 0000000..8d08543 --- /dev/null +++ b/hwpe/softex/archi_softex.h @@ -0,0 +1,81 @@ +/* + * Andrea Belano + * + * Copyright 2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __ARCHI_SOFTEX__ +#define __ARCHI_SOFTEX__ + +#define DATA_WIDTH 256 + +/* PULP Cluster Archi defines */ +#define ARCHI_CLUST_CTRL_BASE ARCHI_CLUSTER_CTRL_ADDR +#define ARCHI_CLUST_HWPE_BASE ARCHI_HWCE_ADDR +#define DMA_COMMAND_QUEUE ARCHI_MCHAN_DEMUX_ADDR +#define DMA_STATUS_REGISTER (ARCHI_MCHAN_DEMUX_ADDR + 4) +#define ARCHI_CL_HWPE_EVT0 12 +#define ARCHI_CL_HWPE_EVT1 13 +#define FC_DMA_EVENT 8 +#define CL_DMA_EVENT 22 +#define CLUST_CTRL_HWPE_EN 0x18 +#define CLUST_CTRL_HWPE_EN_MASK 0x800 +#define __builtin_bitinsert(a,b,c,d) (a | (((b << (32-c)) >> (32-c)) << d)) + +#define SOFTEX_BASE_ADD ARCHI_CLUST_HWPE_BASE +#define SOFTEX_CG_EN_MSK 0x4000 +// Commands +#define SOFTEX_TRIGGER 0x00 +#define SOFTEX_ACQUIRE 0x04 +#define SOFTEX_FINISHED 0x08 +#define SOFTEX_STATUS 0x0C +#define SOFTEX_RUNNING_JOB 0x10 +#define SOFTEX_SOFT_CLEAR 0x14 + +#define SOFTEX_REG_OFFS 0x20 + +#define SOFTEX_IN_ADDR SOFTEX_REG_OFFS + 0x00 +#define SOFTEX_OUT_ADDR SOFTEX_REG_OFFS + 0x04 +#define SOFTEX_TOT_LEN SOFTEX_REG_OFFS + 0x08 +#define SOFTEX_COMMANDS SOFTEX_REG_OFFS + 0x0C +#define SOFTEX_CACHE_BASE_ADDR SOFTEX_REG_OFFS + 0x10 +#define SOFTEX_CAST_CTRL SOFTEX_REG_OFFS + 0x14 + + +#define SOFTEX_CMD_ACC_ONLY 0x00000001 +#define SOFTEX_CMD_DIV_ONLY 0x00000002 +#define SOFTEX_CMD_ACQUIRE_SLOT 0x00000004 +#define SOFTEX_CMD_LAST 0x00000008 +#define SOFTEX_CMD_SET_CACHE_ADDR 0x00000010 +#define SOFTEX_CMD_NO_OP 0x00000020 +#define SOFTEX_CMD_INT_INPUT 0x00000040 +#define SOFTEX_CMD_INT_OUTPUT 0x00000080 + +/* DMA Archi */ +#define DMA_TX 0 +#define DMA_RX 1 +#define DMA_INC 1 + +#define PLP_DMA_TYPE_BIT 0x00000011 +#define PLP_DMA_INCR_BIT 0x00000012 +#define PLP_DMA_2D_BIT 0x00000013 +#define PLP_DMA_ELE_BIT 0x00000014 +#define PLP_DMA_ILE_BIT 0x00000015 +#define PLP_DMA_BLE_BIT 0x00000016 +#define PLP_DMA_2D_TCDM_BIT 0x0000017 + +#endif \ No newline at end of file diff --git a/hwpe/softex/hal_softex.h 
b/hwpe/softex/hal_softex.h new file mode 100644 index 0000000..199a972 --- /dev/null +++ b/hwpe/softex/hal_softex.h @@ -0,0 +1,91 @@ +/* + * Andrea Belano + * + * Copyright 2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __HAL_SOFTEX__ +#define __HAL_SOFTEX__ + +#include +#include "archi_softex.h" + +#define HWPE_WRITE(value, offset) *(volatile int *)(SOFTEX_BASE_ADD + offset) = value +#define HWPE_READ(offset) *(volatile int *)(SOFTEX_BASE_ADD + offset) + +static inline void hwpe_trigger_job() { + HWPE_WRITE(0, SOFTEX_TRIGGER); +} + +static inline int hwpe_acquire_job() { + return HWPE_READ(SOFTEX_ACQUIRE); +} + +static inline unsigned int hwpe_get_status() { + return HWPE_READ(SOFTEX_STATUS); +} + +static inline void hwpe_soft_clear() { + HWPE_WRITE(0, SOFTEX_SOFT_CLEAR); +} + +static inline void hwpe_cg_enable() { + *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) |= (CLUST_CTRL_HWPE_EN_MASK | SOFTEX_CG_EN_MSK); +} + +static inline void hwpe_cg_disable() { + *(volatile int*) (ARCHI_CLUST_CTRL_BASE + CLUST_CTRL_HWPE_EN) &= ~(CLUST_CTRL_HWPE_EN_MASK | SOFTEX_CG_EN_MSK); +} + +static inline void softex_evt_wait() { + do { + eu_evt_maskWaitAndClr (1 << ARCHI_CL_HWPE_EVT0); + } while((*(int volatile *)(ARCHI_CLUST_HWPE_BASE + SOFTEX_STATUS)) != 0); +} + +/* DMA APIs */ +static inline int mchan_alloc(){ + return *(volatile int*) DMA_COMMAND_QUEUE; 
+} + +static inline void mchan_transfer(unsigned int len, + unsigned int ext_addr, + unsigned int tcdm_addr) { + + *(volatile int*) DMA_COMMAND_QUEUE = len | + (DMA_RX << PLP_DMA_TYPE_BIT) | + (DMA_INC << PLP_DMA_INCR_BIT) | + (0 << PLP_DMA_2D_BIT) | + (1 << PLP_DMA_ELE_BIT) | + (1 << PLP_DMA_ILE_BIT) | + (0 << PLP_DMA_BLE_BIT) | + (0 << PLP_DMA_2D_TCDM_BIT); + *(volatile int*) DMA_COMMAND_QUEUE = tcdm_addr; + *(volatile int*) DMA_COMMAND_QUEUE = ext_addr; +} + +static inline void mchan_barrier(int id) { + while(((*(volatile int*)(DMA_STATUS_REGISTER)) >> id ) & 0x1 ) { + eu_evt_maskWaitAndClr(1 << FC_DMA_EVENT); + } +} + +static inline void mchan_free(int id) { + *(volatile int*) DMA_STATUS_REGISTER = 0x1 << id; +} + +#endif \ No newline at end of file diff --git a/hwpe/softex/inc/golden.h b/hwpe/softex/inc/golden.h new file mode 100644 index 0000000..12f0012 --- /dev/null +++ b/hwpe/softex/inc/golden.h @@ -0,0 +1,1031 @@ +#ifndef __SOFTEX_GOLDEN__ +#define __SOFTEX_GOLDEN__ + +#define GOLDEN { \ + 0x31c8, \ + 0x323b, \ + 0x2cba, \ + 0x2bf2, \ + 0x3c72, \ + 0x2c91, \ + 0x39a1, \ + 0x308b, \ + 0x3c56, \ + 0x395d, \ + 0x308b, \ + 0x2f8c, \ + 0x2d45, \ + 0x3693, \ + 0x2613, \ + 0x3a16, \ + 0x32fe, \ + 0x326f, \ + 0x3310, \ + 0x2b82, \ + 0x2fb4, \ + 0x2c91, \ + 0x2f5b, \ + 0x28dd, \ + 0x3730, \ + 0x3b28, \ + 0x3b58, \ + 0x364a, \ + 0x2c1b, \ + 0x3890, \ + 0x2c80, \ + 0x3c89, \ + 0x38d1, \ + 0x2cef, \ + 0x2f2a, \ + 0x29db, \ + 0x2cba, \ + 0x294e, \ + 0x3906, \ + 0x38b8, \ + 0x39cf, \ + 0x3a8c, \ + 0x38d1, \ + 0x3028, \ + 0x3a78, \ + 0x2e7b, \ + 0x2d90, \ + 0x2f5b, \ + 0x31c8, \ + 0x2f9f, \ + 0x29f8, \ + 0x364a, \ + 0x34db, \ + 0x33fb, \ + 0x3ae8, \ + 0x381a, \ + 0x3028, \ + 0x35bf, \ + 0x2689, \ + 0x364a, \ + 0x3211, \ + 0x3791, \ + 0x33c3, \ + 0x3748, \ + 0x3a5b, \ + 0x39a1, \ + 0x2672, \ + 0x276f, \ + 0x34db, \ + 0x2656, \ + 0x2cef, \ + 0x3c13, \ + 0x33ac, \ + 0x3386, \ + 0x39a1, \ + 0x3058, \ + 0x3253, \ + 0x3b75, \ + 0x3a9f, \ + 0x2753, \ + 0x360b, \ + 0x309d, \ + 
0x3568, \ + 0x3c89, \ + 0x276f, \ + 0x28ac, \ + 0x35bf, \ + 0x38a3, \ + 0x344f, \ + 0x2ea1, \ + 0x3003, \ + 0x3c3d, \ + 0x3a16, \ + 0x2626, \ + 0x2d2e, \ + 0x3436, \ + 0x37f0, \ + 0x31c8, \ + 0x36bd, \ + 0x2db8, \ + 0x3386, \ + 0x3a41, \ + 0x33dd, \ + 0x3200, \ + 0x2823, \ + 0x3c02, \ + 0x2dd1, \ + 0x3225, \ + 0x2898, \ + 0x340e, \ + 0x387e, \ + 0x3808, \ + 0x30b2, \ + 0x381a, \ + 0x371c, \ + 0x2689, \ + 0x27ae, \ + 0x3be5, \ + 0x2b32, \ + 0x33ac, \ + 0x3c56, \ + 0x2656, \ + 0x2c48, \ + 0x2cd3, \ + 0x2672, \ + 0x3a5b, \ + 0x3583, \ + 0x3943, \ + 0x2bd6, \ + 0x2a67, \ + 0x387e, \ + 0x3c3d, \ + 0x3709, \ + 0x3ab4, \ + 0x3534, \ + 0x3c13, \ + 0x29aa, \ + 0x2a95, \ + 0x269b, \ + 0x346a, \ + 0x3211, \ + 0x3943, \ + 0x2921, \ + 0x3b9d, \ + 0x2cd3, \ + 0x3a41, \ + 0x3890, \ + 0x323b, \ + 0x2cef, \ + 0x2851, \ + 0x3225, \ + 0x2b4a, \ + 0x2bbd, \ + 0x2a67, \ + 0x3351, \ + 0x3351, \ + 0x2823, \ + 0x3028, \ + 0x382e, \ + 0x3808, \ + 0x2cba, \ + 0x3665, \ + 0x2d2e, \ + 0x2ca5, \ + 0x2db8, \ + 0x3cb0, \ + 0x2fb4, \ + 0x2838, \ + 0x36a7, \ + 0x3253, \ + 0x2810, \ + 0x2753, \ + 0x395d, \ + 0x2f41, \ + 0x36bd, \ + 0x279a, \ + 0x3126, \ + 0x3172, \ + 0x336d, \ + 0x2f05, \ + 0x2f9f, \ + 0x35a8, \ + 0x392c, \ + 0x3015, \ + 0x3780, \ + 0x303f, \ + 0x2e8e, \ + 0x36a7, \ + 0x39cf, \ + 0x34db, \ + 0x2f41, \ + 0x39a1, \ + 0x3ce2, \ + 0x28fa, \ + 0x2a4c, \ + 0x38ed, \ + 0x2936, \ + 0x3ab4, \ + 0x3102, \ + 0x30ca, \ + 0x3be5, \ + 0x31b0, \ + 0x2dd1, \ + 0x2e18, \ + 0x36a7, \ + 0x361d, \ + 0x27fd, \ + 0x2eb6, \ + 0x3a16, \ + 0x2eb6, \ + 0x2a67, \ + 0x37d3, \ + 0x3436, \ + 0x33c3, \ + 0x336d, \ + 0x27c5, \ + 0x2f5b, \ + 0x360b, \ + 0x2cd3, \ + 0x387e, \ + 0x3568, \ + 0x3890, \ + 0x3b9d, \ + 0x3be5, \ + 0x2788, \ + 0x27fd, \ + 0x2886, \ + 0x3126, \ + 0x2eea, \ + 0x2753, \ + 0x2f8c, \ + 0x34db, \ + 0x2d2e, \ + 0x3075, \ + 0x360b, \ + 0x2a1f, \ + 0x3595, \ + 0x3682, \ + 0x2eb6, \ + 0x29db, \ + 0x395d, \ + 0x2f2a, \ + 0x2bbd, \ + 0x3906, \ + 0x263c, \ + 0x2e06, \ + 0x3acc, \ + 0x2700, \ + 0x28dd, 
\ + 0x2f16, \ + 0x2a67, \ + 0x309d, \ + 0x2b1d, \ + 0x2aa8, \ + 0x3709, \ + 0x28fa, \ + 0x3436, \ + 0x3534, \ + 0x3b03, \ + 0x2fe7, \ + 0x2aa8, \ + 0x2823, \ + 0x326f, \ + 0x32ae, \ + 0x3c72, \ + 0x29db, \ + 0x3253, \ + 0x29db, \ + 0x3225, \ + 0x2c91, \ + 0x2700, \ + 0x2bf2, \ + 0x3bca, \ + 0x34f8, \ + 0x29db, \ + 0x3568, \ + 0x27c5, \ + 0x3730, \ + 0x2672, \ + 0x3730, \ + 0x3a2a, \ + 0x303f, \ + 0x32c5, \ + 0x33c3, \ + 0x3693, \ + 0x37f0, \ + 0x33ac, \ + 0x3906, \ + 0x313d, \ + 0x3780, \ + 0x2cba, \ + 0x2c30, \ + 0x3c72, \ + 0x2cd3, \ + 0x2810, \ + 0x263c, \ + 0x382e, \ + 0x28fa, \ + 0x382e, \ + 0x395d, \ + 0x28ac, \ + 0x38a3, \ + 0x32e0, \ + 0x2724, \ + 0x31b0, \ + 0x2613, \ + 0x273a, \ + 0x2cef, \ + 0x29c1, \ + 0x2b93, \ + 0x2f78, \ + 0x2e2c, \ + 0x344f, \ + 0x2b93, \ + 0x2996, \ + 0x2d60, \ + 0x3ce2, \ + 0x3be5, \ + 0x2672, \ + 0x28c3, \ + 0x2ded, \ + 0x34db, \ + 0x2b4a, \ + 0x38d1, \ + 0x27ae, \ + 0x38ed, \ + 0x2c1b, \ + 0x3172, \ + 0x3288, \ + 0x2810, \ + 0x2bbd, \ + 0x3b03, \ + 0x326f, \ + 0x3b15, \ + 0x3b9d, \ + 0x2a0c, \ + 0x3ae8, \ + 0x3846, \ + 0x2e8e, \ + 0x2ecf, \ + 0x286d, \ + 0x3b28, \ + 0x2eea, \ + 0x3156, \ + 0x2c48, \ + 0x3ab4, \ + 0x35bf, \ + 0x2c48, \ + 0x32c5, \ + 0x2d90, \ + 0x3015, \ + 0x3860, \ + 0x3be5, \ + 0x2ded, \ + 0x3a2a, \ + 0x3b03, \ + 0x32ae, \ + 0x346a, \ + 0x3890, \ + 0x34f8, \ + 0x3a41, \ + 0x3b58, \ + 0x36bd, \ + 0x3bb2, \ + 0x2b4a, \ + 0x39a1, \ + 0x3693, \ + 0x2a83, \ + 0x2d1a, \ + 0x3890, \ + 0x3918, \ + 0x3762, \ + 0x3a2a, \ + 0x3860, \ + 0x2f9f, \ + 0x3ce2, \ + 0x3730, \ + 0x34c1, \ + 0x279a, \ + 0x35a8, \ + 0x2ba6, \ + 0x392c, \ + 0x33ac, \ + 0x3943, \ + 0x2da3, \ + 0x2eea, \ + 0x344f, \ + 0x35d8, \ + 0x2ea1, \ + 0x3c13, \ + 0x2a95, \ + 0x2bf2, \ + 0x2823, \ + 0x36f2, \ + 0x381a, \ + 0x3a05, \ + 0x3846, \ + 0x308b, \ + 0x3ae8, \ + 0x3b8b, \ + 0x354c, \ + 0x3b9d, \ + 0x2e2c, \ + 0x273a, \ + 0x3323, \ + 0x28fa, \ + 0x2b82, \ + 0x34aa, \ + 0x2cef, \ + 0x26e2, \ + 0x392c, \ + 0x2cba, \ + 0x2a67, \ + 0x2b4a, \ + 0x2a67, \ + 
0x3748, \ + 0x371c, \ + 0x3288, \ + 0x32c5, \ + 0x3c3d, \ + 0x2e2c, \ + 0x3791, \ + 0x2e5d, \ + 0x3075, \ + 0x3a05, \ + 0x2e8e, \ + 0x32fe, \ + 0x3c02, \ + 0x27fd, \ + 0x2cba, \ + 0x3c89, \ + 0x2838, \ + 0x2700, \ + 0x2e43, \ + 0x3386, \ + 0x37f0, \ + 0x3421, \ + 0x354c, \ + 0x31e2, \ + 0x2996, \ + 0x30b2, \ + 0x2f8c, \ + 0x2a0c, \ + 0x3c02, \ + 0x32e0, \ + 0x2a83, \ + 0x2886, \ + 0x3568, \ + 0x326f, \ + 0x2788, \ + 0x398e, \ + 0x371c, \ + 0x2851, \ + 0x269b, \ + 0x2613, \ + 0x2672, \ + 0x364a, \ + 0x2bf2, \ + 0x387e, \ + 0x2f78, \ + 0x3398, \ + 0x2851, \ + 0x2838, \ + 0x39b6, \ + 0x2e18, \ + 0x3c3d, \ + 0x39b6, \ + 0x3943, \ + 0x28fa, \ + 0x2a83, \ + 0x3cb0, \ + 0x2f2a, \ + 0x2e7b, \ + 0x2f78, \ + 0x3bca, \ + 0x37bb, \ + 0x279a, \ + 0x2e7b, \ + 0x3a9f, \ + 0x392c, \ + 0x27c5, \ + 0x3156, \ + 0x37a5, \ + 0x3c56, \ + 0x3172, \ + 0x2d2e, \ + 0x3665, \ + 0x2fb4, \ + 0x2d08, \ + 0x361d, \ + 0x2abf, \ + 0x36f2, \ + 0x2cd3, \ + 0x30e5, \ + 0x2fb4, \ + 0x2a1f, \ + 0x3cc8, \ + 0x38a3, \ + 0x397b, \ + 0x2f9f, \ + 0x39b6, \ + 0x3200, \ + 0x3253, \ + 0x3156, \ + 0x2724, \ + 0x296a, \ + 0x3a5b, \ + 0x2c30, \ + 0x3398, \ + 0x2f5b, \ + 0x2cef, \ + 0x39ea, \ + 0x3906, \ + 0x3b3f, \ + 0x2724, \ + 0x2753, \ + 0x3200, \ + 0x2a1f, \ + 0x329a, \ + 0x32ae, \ + 0x2b32, \ + 0x33c3, \ + 0x3075, \ + 0x3126, \ + 0x37a5, \ + 0x361d, \ + 0x32fe, \ + 0x3665, \ + 0x263c, \ + 0x2c09, \ + 0x3709, \ + 0x29db, \ + 0x3682, \ + 0x39cf, \ + 0x2ecf, \ + 0x3d00, \ + 0x30ca, \ + 0x3421, \ + 0x3890, \ + 0x2f2a, \ + 0x3172, \ + 0x263c, \ + 0x2689, \ + 0x39ea, \ + 0x2cef, \ + 0x26c8, \ + 0x3015, \ + 0x2936, \ + 0x2689, \ + 0x3943, \ + 0x2db8, \ + 0x37d3, \ + 0x3534, \ + 0x3682, \ + 0x32fe, \ + 0x2b32, \ + 0x2f05, \ + 0x3421, \ + 0x2613, \ + 0x2b4a, \ + 0x3189, \ + 0x3310, \ + 0x2bf2, \ + 0x2f41, \ + 0x3780, \ + 0x3386, \ + 0x3102, \ + 0x326f, \ + 0x2cd3, \ + 0x3c3d, \ + 0x2eb6, \ + 0x33c3, \ + 0x2f5b, \ + 0x2bbd, \ + 0x3b03, \ + 0x2cba, \ + 0x3780, \ + 0x346a, \ + 0x398e, \ + 0x2c30, \ + 0x28fa, \ + 0x2a95, 
\ + 0x3ae8, \ + 0x2c91, \ + 0x381a, \ + 0x2788, \ + 0x26b0, \ + 0x34f8, \ + 0x3c27, \ + 0x34db, \ + 0x3780, \ + 0x3398, \ + 0x3b58, \ + 0x3172, \ + 0x31e2, \ + 0x350c, \ + 0x3bb2, \ + 0x3748, \ + 0x350c, \ + 0x3b75, \ + 0x34aa, \ + 0x2eb6, \ + 0x381a, \ + 0x2cba, \ + 0x2b1d, \ + 0x2613, \ + 0x3496, \ + 0x2abf, \ + 0x3102, \ + 0x2936, \ + 0x3351, \ + 0x2bbd, \ + 0x3496, \ + 0x3398, \ + 0x3015, \ + 0x2a34, \ + 0x2613, \ + 0x2e43, \ + 0x2ded, \ + 0x29db, \ + 0x37a5, \ + 0x296a, \ + 0x2bbd, \ + 0x3bca, \ + 0x37a5, \ + 0x3a2a, \ + 0x2f9f, \ + 0x3709, \ + 0x29c1, \ + 0x3bca, \ + 0x398e, \ + 0x3595, \ + 0x29f8, \ + 0x2da3, \ + 0x3568, \ + 0x3485, \ + 0x2da3, \ + 0x38a3, \ + 0x3323, \ + 0x2d90, \ + 0x344f, \ + 0x26b0, \ + 0x2a0c, \ + 0x3534, \ + 0x3b03, \ + 0x3b28, \ + 0x3c72, \ + 0x2f5b, \ + 0x2a34, \ + 0x3436, \ + 0x30ca, \ + 0x3730, \ + 0x3211, \ + 0x32e0, \ + 0x3126, \ + 0x3003, \ + 0x395d, \ + 0x294e, \ + 0x38ed, \ + 0x3c9c, \ + 0x2996, \ + 0x2b32, \ + 0x313d, \ + 0x350c, \ + 0x2d90, \ + 0x3d11, \ + 0x2eb6, \ + 0x2f78, \ + 0x39a1, \ + 0x3351, \ + 0x2da3, \ + 0x2985, \ + 0x3225, \ + 0x2c62, \ + 0x3ce2, \ + 0x3682, \ + 0x39cf, \ + 0x2aa8, \ + 0x2ea1, \ + 0x2d1a, \ + 0x344f, \ + 0x290e, \ + 0x2700, \ + 0x2af5, \ + 0x3808, \ + 0x3398, \ + 0x3211, \ + 0x2bbd, \ + 0x3780, \ + 0x3323, \ + 0x3172, \ + 0x350c, \ + 0x3acc, \ + 0x3058, \ + 0x3ab4, \ + 0x3200, \ + 0x2fb4, \ + 0x3cb0, \ + 0x2cba, \ + 0x3015, \ + 0x354c, \ + 0x26e2, \ + 0x2cef, \ + 0x3b28, \ + 0x2c30, \ + 0x3c02, \ + 0x2700, \ + 0x2788, \ + 0x2fb4, \ + 0x28ac, \ + 0x3b03, \ + 0x33c3, \ + 0x34db, \ + 0x3b28, \ + 0x3780, \ + 0x3b58, \ + 0x3d00, \ + 0x3075, \ + 0x28fa, \ + 0x34db, \ + 0x34db, \ + 0x279a, \ + 0x2ea1, \ + 0x3386, \ + 0x37bb, \ + 0x35bf, \ + 0x2eb6, \ + 0x371c, \ + 0x2e7b, \ + 0x2d90, \ + 0x2f78, \ + 0x32ae, \ + 0x3b03, \ + 0x269b, \ + 0x2ba6, \ + 0x2ba6, \ + 0x364a, \ + 0x2c91, \ + 0x279a, \ + 0x28fa, \ + 0x3a9f, \ + 0x33c3, \ + 0x2abf, \ + 0x3a2a, \ + 0x3846, \ + 0x2f2a, \ + 0x2b93, \ + 0x397b, \ + 
0x2b4a, \ + 0x371c, \ + 0x2fcc, \ + 0x2a1f, \ + 0x37f0, \ + 0x3943, \ + 0x3730, \ + 0x3d00, \ + 0x2c80, \ + 0x346a, \ + 0x39a1, \ + 0x32ae, \ + 0x308b, \ + 0x39cf, \ + 0x32e0, \ + 0x3c02, \ + 0x2aa8, \ + 0x3568, \ + 0x3c89, \ + 0x2724, \ + 0x296a, \ + 0x3acc, \ + 0x296a, \ + 0x2e7b, \ + 0x2e5d, \ + 0x2936, \ + 0x3c27, \ + 0x3a9f, \ + 0x2ca5, \ + 0x2a4c, \ + 0x2f41, \ + 0x2b82, \ + 0x28c3, \ + 0x39a1, \ + 0x286d, \ + 0x308b, \ + 0x3253, \ + 0x395d, \ + 0x2e8e, \ + 0x2db8, \ + 0x3906, \ + 0x2c48, \ + 0x3310, \ + 0x326f, \ + 0x2810, \ + 0x3172, \ + 0x3632, \ + 0x2b93, \ + 0x3a5b, \ + 0x34db, \ + 0x290e, \ + 0x294e, \ + 0x3c3d, \ + 0x3534, \ + 0x29c1, \ + 0x37a5, \ + 0x39b6, \ + 0x3c13, \ + 0x3acc, \ + 0x269b, \ + 0x33dd, \ + 0x29db, \ + 0x2851, \ + 0x323b, \ + 0x3436, \ + 0x3709, \ + 0x26c8, \ + 0x2656, \ + 0x354c, \ + 0x2810, \ + 0x3ab4, \ + 0x3b9d, \ + 0x371c, \ + 0x2838, \ + 0x3386, \ + 0x3b15, \ + 0x3bca, \ + 0x32e0, \ + 0x2a67, \ + 0x2838, \ + 0x2c91, \ + 0x3189, \ + 0x279a, \ + 0x3225, \ + 0x27c5, \ + 0x3730, \ + 0x2cd3, \ + 0x269b, \ + 0x3225, \ + 0x309d, \ + 0x26e2, \ + 0x2a83, \ + 0x36a7, \ + 0x34aa, \ + 0x2bf2, \ + 0x387e, \ + 0x3c3d, \ + 0x28dd, \ + 0x3730, \ + 0x3253, \ + 0x2ca5, \ + 0x3338, \ + 0x2a67, \ + 0x2ea1, \ + 0x2d7d, \ + 0x3496, \ + 0x2cba, \ + 0x2e2c, \ + 0x2985, \ + 0x3126, \ + 0x2ecf, \ + 0x2e43, \ + 0x32c5, \ + 0x3be5, \ + 0x2e18, \ + 0x3386, \ + 0x3906, \ + 0x2689, \ + 0x33fb, \ + 0x31e2, \ + 0x3762, \ + 0x2b65, \ + 0x3a5b, \ + 0x3a8c, \ + 0x263c, \ + 0x27ae, \ + 0x2a4c, \ + 0x2d60, \ + 0x3860, \ + 0x303f, \ + 0x2724, \ + 0x344f, \ + 0x2d08, \ + 0x351f, \ + 0x361d, \ + 0x30ca, \ + 0x37bb, \ + 0x3ce2, \ + 0x2724, \ + 0x3ab4, \ + 0x2b65, \ + 0x3860, \ + 0x33c3, \ + 0x3015, \ + 0x2c1b, \ + 0x3189, \ + 0x2886, \ + 0x30b2, \ + 0x2996, \ + 0x3288, \ + 0x3918, \ + 0x2b93, \ + 0x2f9f, \ + 0x2613, \ + 0x33fb, \ + 0x3791, \ + 0x3c13, \ + 0x3113, \ + 0x2af5, \ + 0x3791, \ + 0x2e43, \ + 0x2ca5, \ + 0x27ae, \ + 0x3b15, \ + 0x34aa, \ + 0x3338, \ + 0x39a1, 
\ + 0x387e, \ + 0x3172, \ + 0x2b0b, \ + 0x34aa, \ + 0x2ded, \ + 0x3be5, \ + 0x3323, \ + 0x3568, \ + 0x2f41, \ + 0x2bd6, \ + 0x3568, \ + 0x34c1, \ + 0x2689, \ + 0x2672, \ + 0x2eb6, \ + 0x3b8b, \ + 0x3058, \ + 0x3113, \ + 0x3583, \ + 0x2a34, \ + 0x3906, \ + 0x3bb2, \ + 0x2eea, \ + 0x2ad8, \ + 0x319c, \ + 0x3693, \ + 0x33ac, \ + 0x38b8, \ + 0x2689, \ + 0x31e2, \ + 0x33ac, \ + 0x27ae, \ + 0x2bbd, \ + 0x360b, \ + 0x2985, \ + 0x28ac, \ + 0x2b1d, \ + 0x336d, \ + 0x35bf, \ + 0x323b, \ + 0x2e8e, \ + 0x351f, \ + 0x2b82, \ + 0x313d, \ + 0x3421, \ + 0x2f9f, \ + 0x3b8b, \ + 0x2fe7, \ + 0x360b, \ + 0x2b32, \ + 0x2b32, \ + 0x27c5, \ + 0x273a, \ + 0x3b75, \ + 0x2626, \ + 0x2f2a, \ + 0x296a, \ + 0x3730, \ + 0x37d3, \ + 0x2700, \ + 0x395d, \ + 0x30ca, \ + 0x2921, \ + 0x3156, \ + 0x3c9c, \ + 0x2fe7, \ + 0x2788, \ + 0x2bd6, \ + 0x35bf, \ + 0x3a16, \ + 0x36a7, \ + 0x351f, \ + 0x395d, \ + 0x3288, \ + 0x3a2a, \ + 0x37a5, \ + 0x2bbd, \ + 0x2d1a, \ + 0x34db, \ + 0x3126, \ + 0x2fe7, \ + 0x33fb, \ + 0x294e, \ + 0x2bbd, \ + 0x3156, \ + 0x2e7b, \ + 0x2a67, \ + 0x38ed, \ + 0x397b, \ + 0x30ca, \ + 0x2f78, \ + 0x2a0c, \ + 0x3075, \ + 0x2d2e, \ + 0x2656, \ + 0x3a16, \ + 0x2a83, \ + 0x2c80, \ + 0x3693, \ + 0x354c, \ + 0x2788, \ + 0x36f2, \ + 0x37f0, \ + 0x27ae, \ + 0x34f8, \ + 0x2b82, \ + 0x35a8, \ + 0x3632, \ + 0x2f16, \ +} + +#endif \ No newline at end of file diff --git a/hwpe/softex/inc/scores.h b/hwpe/softex/inc/scores.h new file mode 100644 index 0000000..d60ba7a --- /dev/null +++ b/hwpe/softex/inc/scores.h @@ -0,0 +1,1037 @@ +#ifndef __SOFTEX_SCORES__ +#define __SOFTEX_SCORES__ + +#define LENGTH 1024 + +#define FMT_WIDTH 2 + +#define N_VECTORS 1 + +#define SCORES { \ + 0x4182, \ + 0x4187, \ + 0x4114, \ + 0x4102, \ + 0x41f8, \ + 0x4110, \ + 0x41d9, \ + 0x4168, \ + 0x41f7, \ + 0x41d6, \ + 0x4168, \ + 0x4152, \ + 0x4120, \ + 0x41b7, \ + 0x0000, \ + 0x41de, \ + 0x418f, \ + 0x4189, \ + 0x4190, \ + 0x40f0, \ + 0x4156, \ + 0x4110, \ + 0x414e, \ + 0x4078, \ + 0x41be, \ + 0x41ea, \ + 0x41ec, \ + 
0x41b4, \ + 0x4106, \ + 0x41cd, \ + 0x410e, \ + 0x41f9, \ + 0x41d0, \ + 0x4118, \ + 0x414a, \ + 0x40a8, \ + 0x4114, \ + 0x4090, \ + 0x41d2, \ + 0x41cf, \ + 0x41db, \ + 0x41e3, \ + 0x41d0, \ + 0x4160, \ + 0x41e2, \ + 0x413a, \ + 0x4126, \ + 0x414e, \ + 0x4182, \ + 0x4154, \ + 0x40ac, \ + 0x41b4, \ + 0x41a4, \ + 0x419a, \ + 0x41e7, \ + 0x41c8, \ + 0x4160, \ + 0x41ae, \ + 0x3f20, \ + 0x41b4, \ + 0x4185, \ + 0x41c2, \ + 0x4198, \ + 0x41bf, \ + 0x41e1, \ + 0x41d9, \ + 0x3f00, \ + 0x3ff0, \ + 0x41a4, \ + 0x3ec0, \ + 0x4118, \ + 0x41f4, \ + 0x4197, \ + 0x4195, \ + 0x41d9, \ + 0x4164, \ + 0x4188, \ + 0x41ed, \ + 0x41e4, \ + 0x3fe0, \ + 0x41b1, \ + 0x416a, \ + 0x41aa, \ + 0x41f9, \ + 0x3ff0, \ + 0x4068, \ + 0x41ae, \ + 0x41ce, \ + 0x419e, \ + 0x413e, \ + 0x415c, \ + 0x41f6, \ + 0x41de, \ + 0x3e00, \ + 0x411e, \ + 0x419d, \ + 0x41c6, \ + 0x4182, \ + 0x41b9, \ + 0x412a, \ + 0x4195, \ + 0x41e0, \ + 0x4199, \ + 0x4184, \ + 0x4038, \ + 0x41f3, \ + 0x412c, \ + 0x4186, \ + 0x4060, \ + 0x419b, \ + 0x41cc, \ + 0x41c7, \ + 0x416c, \ + 0x41c8, \ + 0x41bd, \ + 0x3f20, \ + 0x4010, \ + 0x41f2, \ + 0x40e4, \ + 0x4197, \ + 0x41f7, \ + 0x3ec0, \ + 0x410a, \ + 0x4116, \ + 0x3f00, \ + 0x41e1, \ + 0x41ab, \ + 0x41d5, \ + 0x4100, \ + 0x40c0, \ + 0x41cc, \ + 0x41f6, \ + 0x41bc, \ + 0x41e5, \ + 0x41a8, \ + 0x41f4, \ + 0x40a0, \ + 0x40c8, \ + 0x3f40, \ + 0x419f, \ + 0x4185, \ + 0x41d5, \ + 0x4088, \ + 0x41ef, \ + 0x4116, \ + 0x41e0, \ + 0x41cd, \ + 0x4187, \ + 0x4118, \ + 0x4048, \ + 0x4186, \ + 0x40e8, \ + 0x40fc, \ + 0x40c0, \ + 0x4193, \ + 0x4193, \ + 0x4038, \ + 0x4160, \ + 0x41c9, \ + 0x41c7, \ + 0x4114, \ + 0x41b5, \ + 0x411e, \ + 0x4112, \ + 0x412a, \ + 0x41fb, \ + 0x4156, \ + 0x4040, \ + 0x41b8, \ + 0x4188, \ + 0x4030, \ + 0x3fe0, \ + 0x41d6, \ + 0x414c, \ + 0x41b9, \ + 0x4008, \ + 0x4176, \ + 0x417c, \ + 0x4194, \ + 0x4146, \ + 0x4154, \ + 0x41ad, \ + 0x41d4, \ + 0x415e, \ + 0x41c1, \ + 0x4162, \ + 0x413c, \ + 0x41b8, \ + 0x41db, \ + 0x41a4, \ + 0x414c, \ + 0x41d9, \ + 0x41fd, \ + 0x4080, 
\ + 0x40bc, \ + 0x41d1, \ + 0x408c, \ + 0x41e5, \ + 0x4172, \ + 0x416e, \ + 0x41f2, \ + 0x4181, \ + 0x412c, \ + 0x4132, \ + 0x41b8, \ + 0x41b2, \ + 0x4028, \ + 0x4140, \ + 0x41de, \ + 0x4140, \ + 0x40c0, \ + 0x41c5, \ + 0x419d, \ + 0x4198, \ + 0x4194, \ + 0x4018, \ + 0x414e, \ + 0x41b1, \ + 0x4116, \ + 0x41cc, \ + 0x41aa, \ + 0x41cd, \ + 0x41ef, \ + 0x41f2, \ + 0x4000, \ + 0x4028, \ + 0x4058, \ + 0x4176, \ + 0x4144, \ + 0x3fe0, \ + 0x4152, \ + 0x41a4, \ + 0x411e, \ + 0x4166, \ + 0x41b1, \ + 0x40b4, \ + 0x41ac, \ + 0x41b6, \ + 0x4140, \ + 0x40a8, \ + 0x41d6, \ + 0x414a, \ + 0x40fc, \ + 0x41d2, \ + 0x3e80, \ + 0x4130, \ + 0x41e6, \ + 0x3fa0, \ + 0x4078, \ + 0x4148, \ + 0x40c0, \ + 0x416a, \ + 0x40e0, \ + 0x40cc, \ + 0x41bc, \ + 0x4080, \ + 0x419d, \ + 0x41a8, \ + 0x41e8, \ + 0x415a, \ + 0x40cc, \ + 0x4038, \ + 0x4189, \ + 0x418c, \ + 0x41f8, \ + 0x40a8, \ + 0x4188, \ + 0x40a8, \ + 0x4186, \ + 0x4110, \ + 0x3fa0, \ + 0x4102, \ + 0x41f1, \ + 0x41a5, \ + 0x40a8, \ + 0x41aa, \ + 0x4018, \ + 0x41be, \ + 0x3f00, \ + 0x41be, \ + 0x41df, \ + 0x4162, \ + 0x418d, \ + 0x4198, \ + 0x41b7, \ + 0x41c6, \ + 0x4197, \ + 0x41d2, \ + 0x4178, \ + 0x41c1, \ + 0x4114, \ + 0x4108, \ + 0x41f8, \ + 0x4116, \ + 0x4030, \ + 0x3e80, \ + 0x41c9, \ + 0x4080, \ + 0x41c9, \ + 0x41d6, \ + 0x4068, \ + 0x41ce, \ + 0x418e, \ + 0x3fc0, \ + 0x4181, \ + 0x0000, \ + 0x3fd0, \ + 0x4118, \ + 0x40a4, \ + 0x40f4, \ + 0x4150, \ + 0x4134, \ + 0x419e, \ + 0x40f4, \ + 0x409c, \ + 0x4122, \ + 0x41fd, \ + 0x41f2, \ + 0x3f00, \ + 0x4070, \ + 0x412e, \ + 0x41a4, \ + 0x40e8, \ + 0x41d0, \ + 0x4010, \ + 0x41d1, \ + 0x4106, \ + 0x417c, \ + 0x418a, \ + 0x4030, \ + 0x40fc, \ + 0x41e8, \ + 0x4189, \ + 0x41e9, \ + 0x41ef, \ + 0x40b0, \ + 0x41e7, \ + 0x41ca, \ + 0x413c, \ + 0x4142, \ + 0x4050, \ + 0x41ea, \ + 0x4144, \ + 0x417a, \ + 0x410a, \ + 0x41e5, \ + 0x41ae, \ + 0x410a, \ + 0x418d, \ + 0x4126, \ + 0x415e, \ + 0x41cb, \ + 0x41f2, \ + 0x412e, \ + 0x41df, \ + 0x41e8, \ + 0x418c, \ + 0x419f, \ + 0x41cd, \ + 0x41a5, \ + 
0x41e0, \ + 0x41ec, \ + 0x41b9, \ + 0x41f0, \ + 0x40e8, \ + 0x41d9, \ + 0x41b7, \ + 0x40c4, \ + 0x411c, \ + 0x41cd, \ + 0x41d3, \ + 0x41c0, \ + 0x41df, \ + 0x41cb, \ + 0x4154, \ + 0x41fd, \ + 0x41be, \ + 0x41a3, \ + 0x4008, \ + 0x41ad, \ + 0x40f8, \ + 0x41d4, \ + 0x4197, \ + 0x41d5, \ + 0x4128, \ + 0x4144, \ + 0x419e, \ + 0x41af, \ + 0x413e, \ + 0x41f4, \ + 0x40c8, \ + 0x4102, \ + 0x4038, \ + 0x41bb, \ + 0x41c8, \ + 0x41dd, \ + 0x41ca, \ + 0x4168, \ + 0x41e7, \ + 0x41ee, \ + 0x41a9, \ + 0x41ef, \ + 0x4134, \ + 0x3fd0, \ + 0x4191, \ + 0x4080, \ + 0x40f0, \ + 0x41a2, \ + 0x4118, \ + 0x3f90, \ + 0x41d4, \ + 0x4114, \ + 0x40c0, \ + 0x40e8, \ + 0x40c0, \ + 0x41bf, \ + 0x41bd, \ + 0x418a, \ + 0x418d, \ + 0x41f6, \ + 0x4134, \ + 0x41c2, \ + 0x4138, \ + 0x4166, \ + 0x41dd, \ + 0x413c, \ + 0x418f, \ + 0x41f3, \ + 0x4028, \ + 0x4114, \ + 0x41f9, \ + 0x4040, \ + 0x3fa0, \ + 0x4136, \ + 0x4195, \ + 0x41c6, \ + 0x419c, \ + 0x41a9, \ + 0x4183, \ + 0x409c, \ + 0x416c, \ + 0x4152, \ + 0x40b0, \ + 0x41f3, \ + 0x418e, \ + 0x40c4, \ + 0x4058, \ + 0x41aa, \ + 0x4189, \ + 0x4000, \ + 0x41d8, \ + 0x41bd, \ + 0x4048, \ + 0x3f40, \ + 0x0000, \ + 0x3f00, \ + 0x41b4, \ + 0x4102, \ + 0x41cc, \ + 0x4150, \ + 0x4196, \ + 0x4048, \ + 0x4040, \ + 0x41da, \ + 0x4132, \ + 0x41f6, \ + 0x41da, \ + 0x41d5, \ + 0x4080, \ + 0x40c4, \ + 0x41fb, \ + 0x414a, \ + 0x413a, \ + 0x4150, \ + 0x41f1, \ + 0x41c4, \ + 0x4008, \ + 0x413a, \ + 0x41e4, \ + 0x41d4, \ + 0x4018, \ + 0x417a, \ + 0x41c3, \ + 0x41f7, \ + 0x417c, \ + 0x411e, \ + 0x41b5, \ + 0x4156, \ + 0x411a, \ + 0x41b2, \ + 0x40d0, \ + 0x41bb, \ + 0x4116, \ + 0x4170, \ + 0x4156, \ + 0x40b4, \ + 0x41fc, \ + 0x41ce, \ + 0x41d7, \ + 0x4154, \ + 0x41da, \ + 0x4184, \ + 0x4188, \ + 0x417a, \ + 0x3fc0, \ + 0x4094, \ + 0x41e1, \ + 0x4108, \ + 0x4196, \ + 0x414e, \ + 0x4118, \ + 0x41dc, \ + 0x41d2, \ + 0x41eb, \ + 0x3fc0, \ + 0x3fe0, \ + 0x4184, \ + 0x40b4, \ + 0x418b, \ + 0x418c, \ + 0x40e4, \ + 0x4198, \ + 0x4166, \ + 0x4176, \ + 0x41c3, \ + 0x41b2, \ + 0x418f, 
\ + 0x41b5, \ + 0x3e80, \ + 0x4104, \ + 0x41bc, \ + 0x40a8, \ + 0x41b6, \ + 0x41db, \ + 0x4142, \ + 0x41fe, \ + 0x416e, \ + 0x419c, \ + 0x41cd, \ + 0x414a, \ + 0x417c, \ + 0x3e80, \ + 0x3f20, \ + 0x41dc, \ + 0x4118, \ + 0x3f80, \ + 0x415e, \ + 0x408c, \ + 0x3f20, \ + 0x41d5, \ + 0x412a, \ + 0x41c5, \ + 0x41a8, \ + 0x41b6, \ + 0x418f, \ + 0x40e4, \ + 0x4146, \ + 0x419c, \ + 0x0000, \ + 0x40e8, \ + 0x417e, \ + 0x4190, \ + 0x4102, \ + 0x414c, \ + 0x41c1, \ + 0x4195, \ + 0x4172, \ + 0x4189, \ + 0x4116, \ + 0x41f6, \ + 0x4140, \ + 0x4198, \ + 0x414e, \ + 0x40fc, \ + 0x41e8, \ + 0x4114, \ + 0x41c1, \ + 0x419f, \ + 0x41d8, \ + 0x4108, \ + 0x4080, \ + 0x40c8, \ + 0x41e7, \ + 0x4110, \ + 0x41c8, \ + 0x4000, \ + 0x3f60, \ + 0x41a5, \ + 0x41f5, \ + 0x41a4, \ + 0x41c1, \ + 0x4196, \ + 0x41ec, \ + 0x417c, \ + 0x4183, \ + 0x41a6, \ + 0x41f0, \ + 0x41bf, \ + 0x41a6, \ + 0x41ed, \ + 0x41a2, \ + 0x4140, \ + 0x41c8, \ + 0x4114, \ + 0x40e0, \ + 0x0000, \ + 0x41a1, \ + 0x40d0, \ + 0x4172, \ + 0x408c, \ + 0x4193, \ + 0x40fc, \ + 0x41a1, \ + 0x4196, \ + 0x415e, \ + 0x40b8, \ + 0x0000, \ + 0x4136, \ + 0x412e, \ + 0x40a8, \ + 0x41c3, \ + 0x4094, \ + 0x40fc, \ + 0x41f1, \ + 0x41c3, \ + 0x41df, \ + 0x4154, \ + 0x41bc, \ + 0x40a4, \ + 0x41f1, \ + 0x41d8, \ + 0x41ac, \ + 0x40ac, \ + 0x4128, \ + 0x41aa, \ + 0x41a0, \ + 0x4128, \ + 0x41ce, \ + 0x4191, \ + 0x4126, \ + 0x419e, \ + 0x3f60, \ + 0x40b0, \ + 0x41a8, \ + 0x41e8, \ + 0x41ea, \ + 0x41f8, \ + 0x414e, \ + 0x40b8, \ + 0x419d, \ + 0x416e, \ + 0x41be, \ + 0x4185, \ + 0x418e, \ + 0x4176, \ + 0x415c, \ + 0x41d6, \ + 0x4090, \ + 0x41d1, \ + 0x41fa, \ + 0x409c, \ + 0x40e4, \ + 0x4178, \ + 0x41a6, \ + 0x4126, \ + 0x41ff, \ + 0x4140, \ + 0x4150, \ + 0x41d9, \ + 0x4193, \ + 0x4128, \ + 0x4098, \ + 0x4186, \ + 0x410c, \ + 0x41fd, \ + 0x41b6, \ + 0x41db, \ + 0x40cc, \ + 0x413e, \ + 0x411c, \ + 0x419e, \ + 0x4084, \ + 0x3fa0, \ + 0x40d8, \ + 0x41c7, \ + 0x4196, \ + 0x4185, \ + 0x40fc, \ + 0x41c1, \ + 0x4191, \ + 0x417c, \ + 0x41a6, \ + 0x41e6, \ + 
0x4164, \ + 0x41e5, \ + 0x4184, \ + 0x4156, \ + 0x41fb, \ + 0x4114, \ + 0x415e, \ + 0x41a9, \ + 0x3f90, \ + 0x4118, \ + 0x41ea, \ + 0x4108, \ + 0x41f3, \ + 0x3fa0, \ + 0x4000, \ + 0x4156, \ + 0x4068, \ + 0x41e8, \ + 0x4198, \ + 0x41a4, \ + 0x41ea, \ + 0x41c1, \ + 0x41ec, \ + 0x41fe, \ + 0x4166, \ + 0x4080, \ + 0x41a4, \ + 0x41a4, \ + 0x4008, \ + 0x413e, \ + 0x4195, \ + 0x41c4, \ + 0x41ae, \ + 0x4140, \ + 0x41bd, \ + 0x413a, \ + 0x4126, \ + 0x4150, \ + 0x418c, \ + 0x41e8, \ + 0x3f40, \ + 0x40f8, \ + 0x40f8, \ + 0x41b4, \ + 0x4110, \ + 0x4008, \ + 0x4080, \ + 0x41e4, \ + 0x4198, \ + 0x40d0, \ + 0x41df, \ + 0x41ca, \ + 0x414a, \ + 0x40f4, \ + 0x41d7, \ + 0x40e8, \ + 0x41bd, \ + 0x4158, \ + 0x40b4, \ + 0x41c6, \ + 0x41d5, \ + 0x41be, \ + 0x41fe, \ + 0x410e, \ + 0x419f, \ + 0x41d9, \ + 0x418c, \ + 0x4168, \ + 0x41db, \ + 0x418e, \ + 0x41f3, \ + 0x40cc, \ + 0x41aa, \ + 0x41f9, \ + 0x3fc0, \ + 0x4094, \ + 0x41e6, \ + 0x4094, \ + 0x413a, \ + 0x4138, \ + 0x408c, \ + 0x41f5, \ + 0x41e4, \ + 0x4112, \ + 0x40bc, \ + 0x414c, \ + 0x40f0, \ + 0x4070, \ + 0x41d9, \ + 0x4050, \ + 0x4168, \ + 0x4188, \ + 0x41d6, \ + 0x413c, \ + 0x412a, \ + 0x41d2, \ + 0x410a, \ + 0x4190, \ + 0x4189, \ + 0x4030, \ + 0x417c, \ + 0x41b3, \ + 0x40f4, \ + 0x41e1, \ + 0x41a4, \ + 0x4084, \ + 0x4090, \ + 0x41f6, \ + 0x41a8, \ + 0x40a4, \ + 0x41c3, \ + 0x41da, \ + 0x41f4, \ + 0x41e6, \ + 0x3f40, \ + 0x4199, \ + 0x40a8, \ + 0x4048, \ + 0x4187, \ + 0x419d, \ + 0x41bc, \ + 0x3f80, \ + 0x3ec0, \ + 0x41a9, \ + 0x4030, \ + 0x41e5, \ + 0x41ef, \ + 0x41bd, \ + 0x4040, \ + 0x4195, \ + 0x41e9, \ + 0x41f1, \ + 0x418e, \ + 0x40c0, \ + 0x4040, \ + 0x4110, \ + 0x417e, \ + 0x4008, \ + 0x4186, \ + 0x4018, \ + 0x41be, \ + 0x4116, \ + 0x3f40, \ + 0x4186, \ + 0x416a, \ + 0x3f90, \ + 0x40c4, \ + 0x41b8, \ + 0x41a2, \ + 0x4102, \ + 0x41cc, \ + 0x41f6, \ + 0x4078, \ + 0x41be, \ + 0x4188, \ + 0x4112, \ + 0x4192, \ + 0x40c0, \ + 0x413e, \ + 0x4124, \ + 0x41a1, \ + 0x4114, \ + 0x4134, \ + 0x4098, \ + 0x4176, \ + 0x4142, \ + 0x4136, 
\ + 0x418d, \ + 0x41f2, \ + 0x4132, \ + 0x4195, \ + 0x41d2, \ + 0x3f20, \ + 0x419a, \ + 0x4183, \ + 0x41c0, \ + 0x40ec, \ + 0x41e1, \ + 0x41e3, \ + 0x3e80, \ + 0x4010, \ + 0x40bc, \ + 0x4122, \ + 0x41cb, \ + 0x4162, \ + 0x3fc0, \ + 0x419e, \ + 0x411a, \ + 0x41a7, \ + 0x41b2, \ + 0x416e, \ + 0x41c4, \ + 0x41fd, \ + 0x3fc0, \ + 0x41e5, \ + 0x40ec, \ + 0x41cb, \ + 0x4198, \ + 0x415e, \ + 0x4106, \ + 0x417e, \ + 0x4058, \ + 0x416c, \ + 0x409c, \ + 0x418a, \ + 0x41d3, \ + 0x40f4, \ + 0x4154, \ + 0x0000, \ + 0x419a, \ + 0x41c2, \ + 0x41f4, \ + 0x4174, \ + 0x40d8, \ + 0x41c2, \ + 0x4136, \ + 0x4112, \ + 0x4010, \ + 0x41e9, \ + 0x41a2, \ + 0x4192, \ + 0x41d9, \ + 0x41cc, \ + 0x417c, \ + 0x40dc, \ + 0x41a2, \ + 0x412e, \ + 0x41f2, \ + 0x4191, \ + 0x41aa, \ + 0x414c, \ + 0x4100, \ + 0x41aa, \ + 0x41a3, \ + 0x3f20, \ + 0x3f00, \ + 0x4140, \ + 0x41ee, \ + 0x4164, \ + 0x4174, \ + 0x41ab, \ + 0x40b8, \ + 0x41d2, \ + 0x41f0, \ + 0x4144, \ + 0x40d4, \ + 0x4180, \ + 0x41b7, \ + 0x4197, \ + 0x41cf, \ + 0x3f20, \ + 0x4183, \ + 0x4197, \ + 0x4010, \ + 0x40fc, \ + 0x41b1, \ + 0x4098, \ + 0x4068, \ + 0x40e0, \ + 0x4194, \ + 0x41ae, \ + 0x4187, \ + 0x413c, \ + 0x41a7, \ + 0x40f0, \ + 0x4178, \ + 0x419c, \ + 0x4154, \ + 0x41ee, \ + 0x415a, \ + 0x41b1, \ + 0x40e4, \ + 0x40e4, \ + 0x4018, \ + 0x3fd0, \ + 0x41ed, \ + 0x3e00, \ + 0x414a, \ + 0x4094, \ + 0x41be, \ + 0x41c5, \ + 0x3fa0, \ + 0x41d6, \ + 0x416e, \ + 0x4088, \ + 0x417a, \ + 0x41fa, \ + 0x415a, \ + 0x4000, \ + 0x4100, \ + 0x41ae, \ + 0x41de, \ + 0x41b8, \ + 0x41a7, \ + 0x41d6, \ + 0x418a, \ + 0x41df, \ + 0x41c3, \ + 0x40fc, \ + 0x411c, \ + 0x41a4, \ + 0x4176, \ + 0x415a, \ + 0x419a, \ + 0x4090, \ + 0x40fc, \ + 0x417a, \ + 0x413a, \ + 0x40c0, \ + 0x41d1, \ + 0x41d7, \ + 0x416e, \ + 0x4150, \ + 0x40b0, \ + 0x4166, \ + 0x411e, \ + 0x3ec0, \ + 0x41de, \ + 0x40c4, \ + 0x410e, \ + 0x41b7, \ + 0x41a9, \ + 0x4000, \ + 0x41bb, \ + 0x41c6, \ + 0x4010, \ + 0x41a5, \ + 0x40f0, \ + 0x41ad, \ + 0x41b3, \ + 0x4148 \ +} + +#endif \ No newline at 
end of file diff --git a/hwpe/softex/softex.c b/hwpe/softex/softex.c new file mode 100644 index 0000000..3126f1a --- /dev/null +++ b/hwpe/softex/softex.c @@ -0,0 +1,95 @@ +/* + * Andrea Belano + * + * Copyright 2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include "stdio.h" +#include "archi_softex.h" +#include "hal_softex.h" +#include "pulp.h" + +#include "inc/golden.h" +#include "inc/scores.h" + +#define TOLERANCE 0x2 + +uint16_t scores_ext [LENGTH] = SCORES; +uint16_t golden [LENGTH] = GOLDEN; + +int main() { + volatile int errors = 0; + + uint16_t volatile *scores = (uint16_t volatile *) pi_l1_malloc(0, (FMT_WIDTH*LENGTH)); + + if(get_core_id() == 0){ + #ifdef USE_DMA + volatile unsigned int dma_id = 0; + dma_id = mchan_alloc(); + + mchan_transfer((unsigned int) FMT_WIDTH*LENGTH , + (unsigned int) scores_ext , + (unsigned int) scores + ); + + mchan_barrier(dma_id); + mchan_free(dma_id); + #else + memcpy(scores, scores_ext, LENGTH*FMT_WIDTH); + #endif + + // Enable softex + hwpe_cg_enable(); + + hwpe_soft_clear(); + + HWPE_WRITE(scores, SOFTEX_IN_ADDR); + HWPE_WRITE(LENGTH * FMT_WIDTH, SOFTEX_TOT_LEN); + HWPE_WRITE(scores, SOFTEX_OUT_ADDR); + + hwpe_trigger_job(); + + softex_evt_wait(); + + // Disable softex + hwpe_cg_disable(); + + for (int i = 0; i < LENGTH; i++) { + uint16_t diff; + + if (golden [i] >= scores[i]) { + diff = golden [i] - 
scores [i]; + } else { + diff = scores [i] - golden [i]; + } + + if (diff > TOLERANCE) { + errors += 1; + + printf ("Mismatch!!!\tIndex: %d\tExpected: 0x%04x\tWas: 0x%04x\tDifference: 0x%x\n", i, golden [i], scores [i], diff); + } + } + + *(int *) 0x1A1040A0 = errors; + + printf("Test completed with %d errors\n", errors); + } + synch_barrier(); + return errors; +} diff --git a/idma-tests.yaml b/idma-tests.yaml new file mode 100644 index 0000000..6bef5c3 --- /dev/null +++ b/idma-tests.yaml @@ -0,0 +1,8 @@ +idma_tests: + idma_simple: + path: ./idma_tests/idma_simple + command: make clean all run + idma_zeromem: + path: ./idma_tests/idma_zeromem + command: make clean all run + diff --git a/idma_tests/README.md b/idma_tests/README.md new file mode 100644 index 0000000..74096cd --- /dev/null +++ b/idma_tests/README.md @@ -0,0 +1,20 @@ +## IDMA Tests + +This folder contains basic tests for the iDMA IP. +Currently, the following are supported: +- **1D transfers** +- **2D transfers** +- **3D transfers** + +To launch each test: +1. Move into the related folder. +2. Launch the following: + 1. `make stimuli` : this will generate the randomized stimuli for the test. The randomized stimuli consist of transfer sizes, number of transfers to be executed, n-dimensional strides, etc ... + 2. `make all` : this will compile the C code. Few choices are available: + - **Single Core mode**: no flags are needed. All transfers specified in the stimuli will be executed by Core 0 only. + - **Sequential Multi-Core Mode**: specify **MULTI_CORE_S=1** in the command line when compiling the code. In this mode, all cores will execute the transfers specified in the stimuli in a sequential manner. + - **Parallel Multi-Core Mode**: specify **MULTI_CORE_P=1** in the command line when compiling the code. In this mode, all cores will execute the transfers specified in the stimulin in a parallel manner. + 3. `make run` : this will launch the simulation in bash mode (use gui=1 for Modelsim gui). +4. 
All transfers will be executed in the three different directions that are currently supported: **L1->L2, L2->L1, L1->L1**. + +Updated drivers for the iDMA can be found at **pulp_cluster/pulp-runtime/include/hal/dma/idma_v2.h** and **pulp_cluster/pulp-runtime/include/archi/dma/idma_v2.h**. diff --git a/idma_tests/idma_2D_FC_TCDM/Makefile b/idma_tests/idma_2D_FC_TCDM/Makefile new file mode 100755 index 0000000..8a5210c --- /dev/null +++ b/idma_tests/idma_2D_FC_TCDM/Makefile @@ -0,0 +1,22 @@ +PULP_APP = test + +TEST_SRCS ?= idma_2D_FC_TCDM.c +PULP_APP_SRCS = $(TEST_SRCS) +PULP_APP_FC_SRCS = $(TEST_FC_SRCS) +ifdef TEST_FC_SRCS +pulpFc=1 +endif + +space := +space += + +#BUILD_DIR = $(subst $(space),_,$(CURDIR)/build/$(TEST_SRCS)) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/idma_tests/idma_2D_FC_TCDM/idma_2D_FC_TCDM.c b/idma_tests/idma_2D_FC_TCDM/idma_2D_FC_TCDM.c new file mode 100644 index 0000000..8e41e26 --- /dev/null +++ b/idma_tests/idma_2D_FC_TCDM/idma_2D_FC_TCDM.c @@ -0,0 +1,101 @@ +#include +#include +#include +#include "pulp.h" + +// #define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 + +L2_DATA static uint8_t ext[MAX_BUFFER_SIZE]; +L1_DATA static uint8_t loc[MAX_BUFFER_SIZE]; + +// Transfer types +typedef enum {L2_TO_L1, L1_TO_L2} test_type_t; + +#ifdef VERBOSE +static void print_memory(uint8_t *mem, unsigned int size, const char *name) { + printf("Memory dump of %s:\n", name); + for (unsigned int i = 0; i < size; i++) { + printf("0x%02X ", mem[i]); + if ((i + 1) % 16 == 0) { + printf("\n"); + } + } + printf("\n"); +} +#endif + +int test_idma_2D(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr, + unsigned int length, unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps); + +int main(void) { + if (rt_cluster_id() != 0) { + return bench_cluster_forward(0); + } + + int error_count = 0; + + if (get_core_id() == 0) { + 
unsigned int sizes[] = {16, 64, 256, 1024}; // Total number of bytes (for initialization of the values in memory) + unsigned int lengths[] = {4, 8, 16, 64}; // Number of bytes to transfer per 1D transaction + unsigned int num_reps[] = {4, 8, 16, 16}; // Number of repetitions of 1D transactions + unsigned int src_strides[] = {4, 8, 24, 56}; // Number of strides in source memory + unsigned int dst_strides[] = {4, 12, 24, 64}; // Number of strides in destination memory + + for (unsigned int i = 0; i < 4; i++) { + error_count += test_idma_2D(sizes[i], L2_TO_L1, (uint32_t)ext, (uint32_t)loc, + lengths[i], src_strides[i], dst_strides[i], num_reps[i]); + } + + for (unsigned int i = 0; i < 4; i++) { + error_count += test_idma_2D(sizes[i], L1_TO_L2, (uint32_t)ext, (uint32_t)loc, + lengths[i], src_strides[i], dst_strides[i], num_reps[i]); + } + } + + return error_count; +} + +int test_idma_2D(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr, + unsigned int length, unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps) { + volatile uint8_t *src_ptr, *dst_ptr; + volatile int error = 0; + volatile unsigned int id; + + if (type == L2_TO_L1) { + src_ptr = (uint8_t *)ext_addr; + dst_ptr = (uint8_t *)tcdm_addr; + for (uint32_t i = 0; i < size; i++) src_ptr[i] = (uint8_t)(i & 0xFF); + id = pulp_cl_idma_L2ToL1_2d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride, dst_stride, num_reps); + plp_cl_dma_wait_toL1(id); + } else { + src_ptr = (uint8_t *)tcdm_addr; + dst_ptr = (uint8_t *)ext_addr; + for (uint32_t i = 0; i < size; i++) src_ptr[i] = (uint8_t)(i & 0xFF); + id = pulp_cl_idma_L1ToL2_2d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride, dst_stride, num_reps); + plp_cl_dma_wait_toL2(id); + } + + // plp_dma_barrier(); + + for (unsigned int rep = 0; rep < num_reps; rep++) { + unsigned int src_offset = rep * src_stride; + unsigned int dst_offset = rep * dst_stride; + for (unsigned int i = 0; i < length; i++) { + 
uint8_t expected = src_ptr[src_offset + i]; + uint8_t actual = dst_ptr[dst_offset + i]; + if (expected != actual) error++; + } + } + +#ifdef VERBOSE + if (error == 0) { + printf("Test passed for %s with size %d, length %d, src_stride %d, dst_stride %d, num_reps %d\n", type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2", size, length, src_stride, dst_stride, num_reps); + } else { + printf("Test failed for %s with size %d, length %d, src_stride %d, dst_stride %d, num_reps %d\n", type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2", size, length, src_stride, dst_stride, num_reps); + } +#endif + + return error; +} \ No newline at end of file diff --git a/idma_tests/idma_3D_FC_TCDM/Makefile b/idma_tests/idma_3D_FC_TCDM/Makefile new file mode 100755 index 0000000..02ba50d --- /dev/null +++ b/idma_tests/idma_3D_FC_TCDM/Makefile @@ -0,0 +1,22 @@ +PULP_APP = test + +TEST_SRCS ?= idma_3D_FC_TCDM.c +PULP_APP_SRCS = $(TEST_SRCS) +PULP_APP_FC_SRCS = $(TEST_FC_SRCS) +ifdef TEST_FC_SRCS +pulpFc=1 +endif + +space := +space += + +#BUILD_DIR = $(subst $(space),_,$(CURDIR)/build/$(TEST_SRCS)) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/idma_tests/idma_3D_FC_TCDM/idma_3D_FC_TCDM.c b/idma_tests/idma_3D_FC_TCDM/idma_3D_FC_TCDM.c new file mode 100644 index 0000000..ae5131b --- /dev/null +++ b/idma_tests/idma_3D_FC_TCDM/idma_3D_FC_TCDM.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include "pulp.h" + +// #define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 + +L2_DATA static uint8_t ext[MAX_BUFFER_SIZE]; +L1_DATA static uint8_t loc[MAX_BUFFER_SIZE]; + +// Transfer types +typedef enum {L2_TO_L1, L1_TO_L2} test_type_t; + +#ifdef VERBOSE +static void print_memory(uint8_t *mem, unsigned int size, const char *name) { + printf("Memory dump of %s:\n", name); + for (unsigned int i = 0; i < size; i++) { + printf("0x%02X ", mem[i]); + if ((i + 1) % 16 == 0) { + printf("\n"); + } + } + printf("\n"); +} +#endif 
+ +int test_idma_3D(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr, + unsigned int length, unsigned int src_stride_2d, unsigned int dst_stride_2d, unsigned int num_reps, + unsigned int src_stride_3d, unsigned int dst_stride_3d, unsigned int num_reps_3d); + +int main(void) { + if (rt_cluster_id() != 0) { + return bench_cluster_forward(0); + } + + int error_count = 0; + + if (get_core_id() == 0) { + unsigned int sizes[] = {64, 256, 1024, 2048}; // Total number of bytes + unsigned int lengths[] = {4, 8, 16, 32}; // Number of bytes per 1D transaction + unsigned int src_strides_2d[] = {8, 16, 32, 64}; // 2D stride in source + unsigned int dst_strides_2d[] = {8, 16, 32, 64}; // 2D stride in destination + unsigned int num_reps_3d[] = {2, 2, 2, 2}; // Number of 2D pages + unsigned int src_strides_3d[] = {32, 128, 512, 1024}; // 3D stride in source + unsigned int dst_strides_3d[] = {32, 128, 512, 1024}; // 3D stride in destination + + for (unsigned int i = 0; i < 4; i++) { + unsigned int num_reps = sizes[i] / lengths[i]; // Calculate num_reps like in multi_core test + + printf("Transfer: %d\n", i); + printf("Size: %d | Length: %d | Src_stride_2d: %d | Dst_stride_2d: %d | Num_reps_2d: %d\n", + sizes[i], lengths[i], src_strides_2d[i], dst_strides_2d[i], num_reps); + printf("Src_stride_3d: %d | Dst_stride_3d: %d | Num_reps_3d: %d\n", + src_strides_3d[i], dst_strides_3d[i], num_reps_3d[i]); + + printf("L2 to L1 transfer\n"); + error_count += test_idma_3D(sizes[i], L2_TO_L1, (uint32_t)ext, (uint32_t)loc, + lengths[i], src_strides_2d[i], dst_strides_2d[i], num_reps, + src_strides_3d[i], dst_strides_3d[i], num_reps_3d[i]); + + printf("L1 to L2 transfer\n"); + error_count += test_idma_3D(sizes[i], L1_TO_L2, (uint32_t)ext, (uint32_t)loc, + lengths[i], src_strides_2d[i], dst_strides_2d[i], num_reps, + src_strides_3d[i], dst_strides_3d[i], num_reps_3d[i]); + } + } + + return error_count; +} + +int test_idma_3D(uint32_t size, test_type_t type, uint32_t ext_addr, 
uint32_t tcdm_addr, + unsigned int length, unsigned int src_stride_2d, unsigned int dst_stride_2d, unsigned int num_reps, + unsigned int src_stride_3d, unsigned int dst_stride_3d, unsigned int num_reps_3d) { + volatile uint8_t *src_ptr, *dst_ptr; + unsigned int offset_2d, offset_3d; + int error = 0; + + if (type == L2_TO_L1) { + // L2 to L1 transfer + src_ptr = (uint8_t*)ext_addr; + dst_ptr = (uint8_t*)tcdm_addr; + + // Fill source region with test data + offset_2d = 0; + offset_3d = 0; + for (int j = 0; j < num_reps_3d; j++) { + for (int q = 0; q < num_reps; q++) { + for (int i = 0; i < length; i++) { + src_ptr[i + offset_2d + offset_3d] = (uint8_t)(i & 0xFF); + } + offset_2d += src_stride_2d; + } + offset_2d = 0; + offset_3d += (num_reps - 1) * src_stride_2d + src_stride_3d; + } + + plp_cl_dma_wait_toL1(pulp_cl_idma_L2ToL1_3d((unsigned int)src_ptr, (unsigned int)dst_ptr, + length, src_stride_2d, dst_stride_2d, num_reps, + src_stride_3d, dst_stride_3d, num_reps_3d)); + } else { + // L1 to L2 transfer + src_ptr = (uint8_t*)tcdm_addr; + dst_ptr = (uint8_t*)ext_addr; + + // Fill source region with test data + offset_2d = 0; + offset_3d = 0; + for (int j = 0; j < num_reps_3d; j++) { + for (int q = 0; q < num_reps; q++) { + for (int i = 0; i < length; i++) { + src_ptr[i + offset_2d + offset_3d] = (uint8_t)(i & 0xFF); + } + offset_2d += src_stride_2d; + } + offset_2d = 0; + offset_3d += (num_reps - 1) * src_stride_2d + src_stride_3d; + } + + plp_cl_dma_wait_toL2(pulp_cl_idma_L1ToL2_3d((unsigned int)src_ptr, (unsigned int)dst_ptr, + length, src_stride_2d, dst_stride_2d, num_reps, + src_stride_3d, dst_stride_3d, num_reps_3d)); + } + + // Check the results - exactly like multi_core test + int src_offset_2d = 0; + int dst_offset_2d = 0; + int src_offset_3d = 0; + int dst_offset_3d = 0; + + for (int rep_3d = 0; rep_3d < num_reps_3d; rep_3d++) { + for (unsigned int rep = 0; rep < num_reps; rep++) { + for (unsigned int i = 0; i < length; i++) { + uint8_t expected = 
src_ptr[src_offset_2d + src_offset_3d + i]; + uint8_t actual = dst_ptr[dst_offset_2d + dst_offset_3d + i]; + + if (expected != actual) { +#ifdef VERBOSE + printf("ERROR: expected @%8x[%d] = %8x vs actual @%8x[%d] = %8x\n", + &src_ptr[src_offset_2d + src_offset_3d + i], src_offset_2d + src_offset_3d + i, expected, + &dst_ptr[dst_offset_2d + dst_offset_3d + i], dst_offset_2d + dst_offset_3d + i, actual); +#endif + error++; + } + } + src_offset_2d += src_stride_2d; + dst_offset_2d += dst_stride_2d; + } + src_offset_2d = 0; + dst_offset_2d = 0; + src_offset_3d += (num_reps - 1) * src_stride_2d + src_stride_3d; + dst_offset_3d += (num_reps - 1) * dst_stride_2d + dst_stride_3d; + } + +#ifdef VERBOSE + if (error == 0) { + printf("Test passed for %s 3D transfer\n", type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2"); + } else { + printf("Test failed for %s 3D transfer with %d errors\n", + type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2", error); + } +#endif + + return error; +} \ No newline at end of file diff --git a/idma_tests/idma_TCDM_to_TCDM/Makefile b/idma_tests/idma_TCDM_to_TCDM/Makefile new file mode 100755 index 0000000..4249b6f --- /dev/null +++ b/idma_tests/idma_TCDM_to_TCDM/Makefile @@ -0,0 +1,22 @@ +PULP_APP = test + +TEST_SRCS ?= idma_TCDM_to_TCDM.c +PULP_APP_SRCS = $(TEST_SRCS) +PULP_APP_FC_SRCS = $(TEST_FC_SRCS) +ifdef TEST_FC_SRCS +pulpFc=1 +endif + +space := +space += + +#BUILD_DIR = $(subst $(space),_,$(CURDIR)/build/$(TEST_SRCS)) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/idma_tests/idma_TCDM_to_TCDM/idma_TCDM_to_TCDM.c b/idma_tests/idma_TCDM_to_TCDM/idma_TCDM_to_TCDM.c new file mode 100644 index 0000000..ae94f01 --- /dev/null +++ b/idma_tests/idma_TCDM_to_TCDM/idma_TCDM_to_TCDM.c @@ -0,0 +1,73 @@ + +#include +#include +#include +#include "pulp.h" + +#define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 +#define BUFFER_SIZE 0x0200 + +L1_DATA static uint8_t 
src_buffer[MAX_BUFFER_SIZE]; +L1_DATA static uint8_t dst_buffer[MAX_BUFFER_SIZE]; + +int test_idma_l1_to_l1(unsigned int len); + +int main(void) { + if (rt_cluster_id() != 0) { + return bench_cluster_forward(0); + } + + int error_count = 0; + + if (get_core_id() == 0) { + // Run the test for different buffer sizes + for (unsigned int len = 8; len <= BUFFER_SIZE; len *= 2) { + error_count += test_idma_l1_to_l1(len); + } + } + + return error_count; +} + +int test_idma_l1_to_l1(unsigned int len) { + volatile unsigned int id; + volatile int error = 0; + volatile uint32_t test, read; + + printf("STARTING L1 TO L1 DMA TEST FOR LENGTH: %d\n", len); + + // Fill source buffer with known data pattern + for (unsigned int i = 0; i < len / 4; i++) { + ((uint32_t *)src_buffer)[i] = 0xABCD0000 + i; + } + + // Clear destination buffer + memset(dst_buffer, 0, len); + + // Perform DMA transfer from src_buffer to dst_buffer within L1 memory + id = pulp_cl_idma_L1ToL1((unsigned int)src_buffer, (unsigned int)dst_buffer, len); + + // Wait for DMA transfer to complete + plp_cl_dma_barrier(); + + // Verify data in destination buffer + for (unsigned int i = 0; i < len / 4; i++) { + test = 0xABCD0000 + i; + read = ((uint32_t *)dst_buffer)[i]; + + if (test != read) { + printf("Error!!! 
Read: %x, Expected: %x, Index: %d\n", read, test, i); + error++; + } + } + + if (error == 0) { + printf("L1 to L1 DMA transfer test passed for length %d.\n", len); + } else { + printf("L1 to L1 DMA transfer test failed for length %d with %d errors.\n", len, error); + } + + return error; +} diff --git a/idma_tests/idma_allcores/Makefile b/idma_tests/idma_allcores/Makefile new file mode 100755 index 0000000..ea1e235 --- /dev/null +++ b/idma_tests/idma_allcores/Makefile @@ -0,0 +1,22 @@ +PULP_APP = test + +TEST_SRCS ?= allcores.c +PULP_APP_SRCS = $(TEST_SRCS) +PULP_APP_FC_SRCS = $(TEST_FC_SRCS) +ifdef TEST_FC_SRCS +pulpFc=1 +endif + +space := +space += + +#BUILD_DIR = $(subst $(space),_,$(CURDIR)/build/$(TEST_SRCS)) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/idma_tests/idma_allcores/allcores.c b/idma_tests/idma_allcores/allcores.c new file mode 100644 index 0000000..f833dd5 --- /dev/null +++ b/idma_tests/idma_allcores/allcores.c @@ -0,0 +1,112 @@ +#include +#include "pulp.h" +//#include "mchan_tests.h" + +#define VERBOSE + +#define MAX_BUFFER_SIZE 0x2200 + +//static unsigned char *ext; +//static unsigned char *loc; + +//#define EXT_DATA_ADDR ((unsigned int)ext) +//#define TCDM_DATA_ADDR ((unsigned int)loc) + +L2_DATA static unsigned char ext[MAX_BUFFER_SIZE]; +L1_DATA static unsigned char loc[MAX_BUFFER_SIZE]; +L1_DATA static int error_count = 0; +#define EXT_DATA_ADDR ((unsigned int) ext) +#define TCDM_DATA_ADDR ((unsigned int) loc) +typedef enum {RX, TX, TCDM} test_type_t; + +int test_idma_1d(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr); +int main() +{ + + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + int n_bytes [] = {5, 19, 400}; + + for (int core=0; core +#include +#include +#include +#include "pulp.h" + +#define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 + +L2_DATA static uint8_t 
ext[MAX_BUFFER_SIZE]; // External memory buffer (L2) +L1_DATA static uint8_t loc[MAX_BUFFER_SIZE]; // Local memory buffer (TCDM / L1) + +typedef enum {L2_TO_L1, L1_TO_L2} test_type_t; + +int test_idma(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr); + +int main(void) { + if (rt_cluster_id() != 0) { + printf("Test only runs on cluster 0\n"); + return bench_cluster_forward(0); + } + + int error_count = 0; + + if (get_core_id() == 0) { + + uint32_t size = 5; + // Test for L2_TO_L1 operation + for (int i = 0; i < size; i++) { + error_count += test_idma(size, L2_TO_L1, (uint32_t)ext, (uint32_t)loc); + } + + // Test for L1_TO_L2 operation + for (int i = 0; i < size; i++) { + error_count += test_idma(size, L1_TO_L2, (uint32_t)ext, (uint32_t)loc); + } + } + + return error_count; +} + +int test_idma(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr) { + volatile uint8_t expected, actual; + volatile int error = 0; + volatile unsigned int id; + + if (type == L2_TO_L1) { + + for (uint32_t i = 0; i < size; i++) { + *(uint8_t *)(ext_addr + i) = (uint8_t)(i & 0xFF); + } + + for (uint32_t i = 0; i < size; i++) { + *(uint8_t *)(tcdm_addr + i) = 0; + } + + // memset((void *)tcdm_addr, 0, size + 16); + + id = pulp_idma_memcpy(ext_addr, tcdm_addr, size, IDMA_PROT_AXI, IDMA_PROT_OBI); + + /* L2_TO_L1 transaction: + +----------------+ +----------------+ + | | DMA Transfer | | + | L2 Memory | ---------------------> | L1 Memory | + | (ext buffer) | (pulp_idma_memcpy) | (loc buffer) | + | | | | + +----------------+ +----------------+ + */ + plp_cl_dma_wait_toL1(id); + } else if (type == L1_TO_L2) { + // Fill L1 buffer with a pattern + for (uint32_t i = 0; i < size; i++) { + *(uint8_t *)(tcdm_addr + i) = (uint8_t)(i & 0xFF); + } + + // Clear L2 + memset((void *)ext_addr, 0, size + 16); + + // Perform DMA transfer from local memory to external memory + id = pulp_idma_memcpy(tcdm_addr, ext_addr, size, IDMA_PROT_OBI, IDMA_PROT_AXI); + + /* L1_TO_L2 
transaction: + +----------------+ +----------------+ + | | DMA Transfer | | + | L2 Memory | <--------------------- | L1 Memory | + | (ext buffer) | (pulp_idma_memcpy) | (loc buffer) | + | | | | + +----------------+ +----------------+ + */ + plp_cl_dma_wait_toL2(id); + } else { + printf("Invalid test type. It must be either L2_TO_L1 or L1_TO_L2.\n"); + return 1; + } + + // plp_cl_dma_wait(); + // plp_dma_barrier(); + + // Verify data + for (uint32_t i = 0; i < size; i++) { + expected = (uint8_t)(i & 0xFF); // Same pattern as L2_TO_L1 + if (type == L2_TO_L1) { + // Reading from the local memory buffer "loc" allocated in L1 memory + actual = *(uint8_t *)(tcdm_addr + i); + } else { // L1_TO_L2 + // Reading from the external memory buffer "ext" allocated in L2 memory + actual = *(uint8_t *)(ext_addr + i); + } + + if (expected != actual) { + printf("Error at index %u: Expected 0x%02X, Got 0x%02X\n", i, expected, actual); + error++; + } + } + + if (error == 0) { + printf("Test passed for %s of length %u.\n", type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2", size); + } else { + printf("Test failed for %s of length %u with %u errors.\n", type == L2_TO_L1 ? 
"L2_TO_L1" : "L1_TO_L2", size, error); + } + + return error; +} \ No newline at end of file diff --git a/idma_tests/idma_multi_core/Makefile b/idma_tests/idma_multi_core/Makefile new file mode 100644 index 0000000..a1092e3 --- /dev/null +++ b/idma_tests/idma_multi_core/Makefile @@ -0,0 +1,31 @@ +PULP_APP = test +TEST_SRCS ?= idma_multi_core.c +PULP_APP_SRCS = $(TEST_SRCS) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +ifdef MULTI_CORE_P +PULP_CFLAGS += -DMULTI_CORE_P +endif + +ifdef MULTI_CORE_S +PULP_CFLAGS += -DMULTI_CORE_S +endif + +ifdef SINGLE_CORE +PULP_CFLAGS += -DSINGLE_CORE +endif + +ifdef QUICK_MODE +PULP_CFLAGS += -DQUICK_MODE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +stimuli: + python gen_stimuli_idma.py diff --git a/idma_tests/idma_multi_core/gen_stimuli_idma.py b/idma_tests/idma_multi_core/gen_stimuli_idma.py new file mode 100644 index 0000000..53ce234 --- /dev/null +++ b/idma_tests/idma_multi_core/gen_stimuli_idma.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +import sys +import random +import argparse +import math +import re + + +parser = argparse.ArgumentParser(description='Generate stimuli for iDMA simple transfer') + +args = parser.parse_args() + +def write_transfer_sizes_array(f, name, arr): + f.write ('unsigned int %s[] = {\n' % name) + for v in arr: + random_int = random.randint(1, TRANSFER_SIZE) + f.write('%d, \n' % random_int) + f.write('};\n\n') + return + +def write_define(f, name,val): + f.write('#define %s %d\n\n' % (name,val)) + return + +# Randomize the number of transfers to execute +NB_TRANSFERS = random.randint(1, 10) + +# Randomize between 1 and 1024 the size (in bytes) of each transfer (Must be less than CORE_SPACE defined in the C header) +TRANSFER_SIZE = 1024 + +sizes = [None] * NB_TRANSFERS + +f_sizes = open('idma_parameters.h', 'w') +f_defines = open ('idma_defines.h', 'w') + +write_transfer_sizes_array(f_sizes, 'sizes', sizes) +write_define(f_defines, 
'NB_TRANSFERS', NB_TRANSFERS) diff --git a/idma_tests/idma_multi_core/idma_defines.h b/idma_tests/idma_multi_core/idma_defines.h new file mode 100644 index 0000000..b544610 --- /dev/null +++ b/idma_tests/idma_multi_core/idma_defines.h @@ -0,0 +1,2 @@ +#define NB_TRANSFERS 7 + diff --git a/idma_tests/idma_multi_core/idma_multi_core.c b/idma_tests/idma_multi_core/idma_multi_core.c new file mode 100644 index 0000000..079c250 --- /dev/null +++ b/idma_tests/idma_multi_core/idma_multi_core.c @@ -0,0 +1,187 @@ +#include "idma_multi_core.h" + +#define TOT_SIZE 8 * CORE_SPACE + +int errors[8] = {0}; +int test_status = 8; + +uint32_t l1_addr[8] = {0}; +uint32_t l1_dst_addr[8] = {0}; +uint32_t l2_addr[8] = {0}; + +int test_idma_1D (int core_id, uint32_t size, int ext2loc, int loc2loc) { + volatile uint8_t *src_ptr, *dst_ptr; + + int error = 0; + + if (loc2loc == 1) { + // L1 to L1 transfer + src_ptr = (uint8_t*) l1_addr[core_id]; + dst_ptr = (uint8_t*) l1_dst_addr[core_id]; + } else if (ext2loc == 1) { + // L2 to L1 transfer + src_ptr = (uint8_t*) l2_addr[core_id]; + dst_ptr = (uint8_t*) l1_addr[core_id]; + } else { + // L1 to L2 transfer + src_ptr = (uint8_t*) l1_addr[core_id]; + dst_ptr = (uint8_t*) l2_addr[core_id]; + } + + // Fill source region with test data + for (int i = 0; i < size; i++) { + src_ptr[i] = (uint8_t)(i & 0xFF); + } + + if (loc2loc == 1) { + plp_cl_dma_wait_toL1(pulp_cl_idma_L1ToL1((unsigned int) src_ptr, (unsigned int) dst_ptr, size)); + } else if (ext2loc == 1) { + plp_cl_dma_wait_toL1(pulp_cl_idma_L2ToL1((unsigned int) src_ptr, (unsigned int) dst_ptr, size)); + } else { + plp_cl_dma_wait_toL2(pulp_cl_idma_L1ToL2((unsigned int) src_ptr, (unsigned int) dst_ptr, size)); + } + + // Check the results + + for (int i=0; i < size; i++) { + uint8_t expected = src_ptr[i]; + uint8_t actual = dst_ptr[i]; + + if (expected != actual) { + error++; + if (core_id == 0) { + PRINTF ("Error: expected @%8x = %8x vs actual @%8x = %8x \n", expected, &src_ptr[i], actual, 
&dst_ptr[i]); + } + } + } + + return error; +} + +void allocate_mem_to_cores () { + int core_id = rt_core_id(); + + // Pre-allocate TOT_SIZE = 8 * CORE_SPACE: then we split this window to assign + // each core its available space for iDMA transfers + // pi_l1_malloc starts allocating from 0x10004008 in L1 + // pi_l2_malloc starts allocating from 0x1c000a60 in L2 + + if (core_id == 0) { + l1_addr[0] = (uint32_t) pi_l1_malloc(0, TOT_SIZE); + l1_dst_addr[0] = (uint32_t) pi_l1_malloc(0, TOT_SIZE); + l2_addr[0] = (uint32_t) pi_l2_malloc(TOT_SIZE); + } + + // The following synch_barrier is needed so that + // no core can assign its address range until the mallocs are executed + synch_barrier(); + + l1_addr[core_id] = l1_addr[0] + core_id * CORE_SPACE; + l1_dst_addr[core_id] = l1_dst_addr[0] + core_id * CORE_SPACE; + l2_addr[core_id] = l2_addr[0] + core_id * CORE_SPACE; + + // The following synch_barrier is needed so that + // no core can start executing until all address ranges have been assigned + synch_barrier(); + +} + +void free_allocated_memory () { + + synch_barrier(); + // Only Core 0 takes care of freeing the allocated memory, since it's the one + // allocated it at the beginning of the test + if (core_id == 0) { + pi_l1_free(0, l1_addr, TOT_SIZE); + pi_l1_free(0, l1_dst_addr, TOT_SIZE); + pi_l2_free(l2_addr, TOT_SIZE); + } +} + +int main () { + int core_id = rt_core_id(); + unsigned int size = 0; + + allocate_mem_to_cores(); + + #ifdef MULTI_CORE_P + // MULTI CORE PARALLEL MODE: each core uses the iDMA in a parallel manner + if (core_id == 0) { + PRINTF ("MULTI CORE PARALLEL MODE \n"); + } + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + size = idma_presets[k]; + #else + size = sizes[k]; + #endif + if (core_id == 0){ + PRINTF ("Size: %d \n", size); + } + // L1 -> L2 + errors[core_id] += test_idma_1D(core_id, size, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_1D(core_id, size, 1, 0); + // L1 -> L1 + errors[core_id] += test_idma_1D(core_id, 
size, 0, 1); + } + synch_barrier(); + #elif MULTI_CORE_S + // MULTI CORE SERIAL MODE: each core uses the iDMA in a serial manner + if (core_id == 0) { + PRINTF ("MULTI CORE SERIAL MODE \n"); + } + for (int i = 0; i < 8; i++) { + if (core_id == i) { + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + size = idma_presets[k]; + #else + size = sizes[k]; + #endif + if (core_id == 0){ + PRINTF ("Size: %d \n", size); + } + // L1 -> L2 + errors[core_id] += test_idma_1D(core_id, size, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_1D(core_id, size, 1, 0); + // L1 -> L1 + errors[core_id] += test_idma_1D(core_id, size, 0, 1); + } + } + } + #else + if (core_id == 0) { + // SINGLE CORE MODE: just core 0 uses the iDMA + PRINTF ("SINGLE CORE MODE: CORE 0 \n"); + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + size = idma_presets[k]; + #else + size = sizes[k]; + #endif + PRINTF ("Size: %d \n", size); + // L1 -> L2 + errors[core_id] += test_idma_1D(core_id, size, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_1D(core_id, size, 1, 0); + // L1 -> L1 + errors[core_id] += test_idma_1D(core_id, size, 0, 1); + } + } + #endif + + if (core_id == 0) { + for (int i = 0; i<8; i++) { + if (errors[i] == 0) { + PRINTF ("Core %d returned %d errors \n", i, errors[i]); + test_status--; + } + } + } + + free_allocated_memory(); + + return test_status; +} diff --git a/idma_tests/idma_multi_core/idma_multi_core.h b/idma_tests/idma_multi_core/idma_multi_core.h new file mode 100644 index 0000000..af672e0 --- /dev/null +++ b/idma_tests/idma_multi_core/idma_multi_core.h @@ -0,0 +1,48 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +//Author: Andreas Kuster +// +//Description: Generated register defines for dma_frontend + +#include "pulp.h" +#include +#include +#include +#include +#include + +#include "idma_defines.h" +#include "idma_parameters.h" +#include "idma_presets.h" + +#ifndef _DMA_FRONTEND_REG_DEFS_ +#define _DMA_FRONTEND_REG_DEFS_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DMA_CONF_DECOUPLE 0 +#define DMA_CONF_DEBURST 0 +#define DMA_CONF_SERIALIZE 0 + +#define CORE_SPACE 2048 + +#if VERBOSE + #define PRINTF(...) printf(__VA_ARGS__) +#else + #define PRINTF(...) +#endif + +#ifdef QUICK_MODE + #define TRANSFERS 12 +#else + #define TRANSFERS NB_TRANSFERS +#endif + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // _DMA_FRONTEND_REG_DEFS_ diff --git a/idma_tests/idma_multi_core/idma_parameters.h b/idma_tests/idma_multi_core/idma_parameters.h new file mode 100644 index 0000000..0388301 --- /dev/null +++ b/idma_tests/idma_multi_core/idma_parameters.h @@ -0,0 +1,10 @@ +unsigned int sizes[] = { +89, +947, +156, +560, +278, +869, +238, +}; + diff --git a/idma_tests/idma_multi_core/idma_presets.h b/idma_tests/idma_multi_core/idma_presets.h new file mode 100644 index 0000000..3779763 --- /dev/null +++ b/idma_tests/idma_multi_core/idma_presets.h @@ -0,0 +1,3 @@ +unsigned int idma_presets [] = { + 1, 2, 3, 4, 8, 16, 32, 64, 128, 256, 512, 1024 +}; \ No newline at end of file diff --git a/idma_tests/idma_multi_core_2d/Makefile b/idma_tests/idma_multi_core_2d/Makefile new file mode 100644 index 0000000..1d6678e --- /dev/null +++ b/idma_tests/idma_multi_core_2d/Makefile @@ -0,0 +1,27 @@ +PULP_APP = test +TEST_SRCS ?= idma_multi_core_2d.c +PULP_APP_SRCS = $(TEST_SRCS) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +ifdef MULTI_CORE_P +PULP_CFLAGS += -DMULTI_CORE_P +endif + +ifdef MULTI_CORE_S +PULP_CFLAGS += -DMULTI_CORE_S +endif + +ifdef QUICK_MODE +PULP_CFLAGS += -DQUICK_MODE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include 
$(PULP_SDK_HOME)/install/rules/pulp.mk + +stimuli: + python gen_stimuli_idma_2d.py \ No newline at end of file diff --git a/idma_tests/idma_multi_core_2d/gen_stimuli_idma_2d.py b/idma_tests/idma_multi_core_2d/gen_stimuli_idma_2d.py new file mode 100644 index 0000000..87b7bcb --- /dev/null +++ b/idma_tests/idma_multi_core_2d/gen_stimuli_idma_2d.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +import sys +import random +import argparse +import math +import re + + +parser = argparse.ArgumentParser(description='Generate stimuli for iDMA simple transfer') + +args = parser.parse_args() + +def write_transfer_parameters_struct(f, name): + f.write ('typedef struct {\n') + f.write (' unsigned int size;\n') + f.write (' unsigned int length;\n') + f.write (' unsigned int src_stride;\n') + f.write (' unsigned int dst_stride;\n') + f.write ('} %s;\n\n' % name) + return + +def write_transfer_parameters_array(f, name, arr): + f.write ('TransferParameters %s[] = {\n' % name) + for v in arr: + length = random.randint(1, MAX_LENGTH) + size = random.randint(1, TRANSFER_SIZE) + length + src_stride = random.randint(1, MAX_STRIDE) + length + dst_stride = random.randint(1, MAX_STRIDE) + length + f.write('{%d, %d, %d, %d},\n' % (size, length, src_stride, dst_stride)) + f.write('};\n\n') + return + +def write_define(f, name,val): + f.write('#define %s %d\n\n' % (name,val)) + return + +# Randomize between 1 and 10 the number of 2D iDMA transfers to be performed +NB_TRANSFERS = random.randint(1, 10) + +# Randomize between 1 and 128 the size in bytes of each transfer +TRANSFER_SIZE = 128 +MAX_STRIDE = 10 +MAX_LENGTH = 10 + +transfer_params = [None] * NB_TRANSFERS + +f_params = open('idma_parameters.h', 'w') +f_defines = open('idma_defines.h', 'w') + +write_define(f_defines, 'NB_TRANSFERS', NB_TRANSFERS) +write_transfer_parameters_struct(f_params, 'TransferParameters') +write_transfer_parameters_array(f_params, 'transfer_params', transfer_params) + +f_params.close() +f_defines.close() \ No newline 
at end of file diff --git a/idma_tests/idma_multi_core_2d/idma_defines.h b/idma_tests/idma_multi_core_2d/idma_defines.h new file mode 100644 index 0000000..618d62d --- /dev/null +++ b/idma_tests/idma_multi_core_2d/idma_defines.h @@ -0,0 +1,2 @@ +#define NB_TRANSFERS 8 + diff --git a/idma_tests/idma_multi_core_2d/idma_multi_core_2d.c b/idma_tests/idma_multi_core_2d/idma_multi_core_2d.c new file mode 100644 index 0000000..f5dc0eb --- /dev/null +++ b/idma_tests/idma_multi_core_2d/idma_multi_core_2d.c @@ -0,0 +1,209 @@ +#include "idma_multi_core_2d.h" + +#define TOT_SIZE 8 * CORE_SPACE + +int errors[8] = {0}; +int test_status = 8; + +uint32_t l1_addr[8] = {0}; +uint32_t l1_dst_addr[8] = {0}; +uint32_t l2_addr[8] = {0}; + +void print_transfer (TransferParameters transfer) { + if (rt_core_id() == 0) { + PRINTF ("Transfer Parameters: \n"); + PRINTF ("Size: %d | Length: %d \n", transfer.size, transfer.length); + PRINTF ("Src_stride_2d: %d | Dst_stride_2d: %d \n", transfer.src_stride, transfer.dst_stride); + } +} + +int test_idma_2D (int core_id, TransferParameters transfer, int ext2loc, int loc2loc) { + volatile uint8_t *src_ptr, *dst_ptr; + + int error = 0; + int offset_2d; + + uint32_t src_stride = transfer.src_stride; + uint32_t dst_stride = transfer.dst_stride; + uint32_t size = transfer.size; + uint32_t length = transfer.length; + uint32_t num_reps = size/length; + + if (loc2loc == 1) { + // L1 to L1 transfer + src_ptr = (uint8_t*) l1_addr[core_id]; + dst_ptr = (uint8_t*) l1_dst_addr[core_id]; + } else if (ext2loc == 1) { + // L2 to L1 transfer + src_ptr = (uint8_t*) l2_addr[core_id]; + dst_ptr = (uint8_t*) l1_addr[core_id]; + } else { + // L1 to L2 transfer + src_ptr = (uint8_t*) l1_addr[core_id]; + dst_ptr = (uint8_t*) l2_addr[core_id]; + } + + // Fill source region with test data + for (int q = 0; q < num_reps; q++) { + offset_2d = q * src_stride; + for (int i = 0; i < length; i++) { + src_ptr[i+offset_2d] = (uint8_t)(i & 0xFF); + } + } + + if (loc2loc == 1) { + 
plp_cl_dma_wait_toL1(pulp_cl_idma_L1ToL1_2d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride, dst_stride, num_reps)); + } if (ext2loc == 1) { + plp_cl_dma_wait_toL1(pulp_cl_idma_L2ToL1_2d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride, dst_stride, num_reps)); + } else { + plp_cl_dma_wait_toL2(pulp_cl_idma_L1ToL2_2d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride, dst_stride, num_reps)); + } + + // Check the results + + for (unsigned int rep = 0; rep < num_reps; rep++) { + unsigned int src_offset = rep * src_stride; + unsigned int dst_offset = rep * dst_stride; + for (unsigned int i = 0; i < length; i++) { + uint8_t expected = src_ptr[src_offset + i]; + uint8_t actual = dst_ptr[dst_offset + i]; + + if (expected != actual) { + error++; + if (core_id == 0) { + PRINTF ("ERROR: expected[%d] @%8x = %8x vs actual[%d] @%8x = %8x \n", src_offset + i, &src_ptr[src_offset + i], + expected, dst_offset+i, &dst_ptr[dst_offset + i], actual); + } + } + + } + } + + return error; +} + +void allocate_mem_to_cores () { + int core_id = rt_core_id(); + + // Pre-allocate TOT_SIZE = 8 * CORE_SPACE: then we split this window to assign + // each core its available space for iDMA transfers + // pi_l1_malloc starts allocating from 0x10004008 in L1 + // pi_l2_malloc starts allocating from 0x1c000a60 in L2 + + if (core_id == 0) { + l1_addr[0] = (uint32_t) pi_l1_malloc(0, TOT_SIZE); + l1_dst_addr[0] = (uint32_t) pi_l1_malloc(0, TOT_SIZE); + l2_addr[0] = (uint32_t) pi_l2_malloc(TOT_SIZE); + } + + // The following synch_barrier is needed so that + // no core can assign its address range until the mallocs are executed + synch_barrier(); + + l1_addr[core_id] = l1_addr[0] + core_id * CORE_SPACE; + l1_dst_addr[core_id] = l1_dst_addr[0] + core_id * CORE_SPACE; + l2_addr[core_id] = l2_addr[0] + core_id * CORE_SPACE; + + // The following synch_barrier is needed so that + // no core can start executing until all address ranges have been assigned + 
synch_barrier(); + +} + +void free_allocated_memory () { + + synch_barrier(); + // Only Core 0 takes care of freeing the allocated memory, since it's the one + // allocated it at the beginning of the test + if (core_id == 0) { + pi_l1_free(0, l1_addr, TOT_SIZE); + pi_l1_free(0, l1_dst_addr, TOT_SIZE); + pi_l2_free(l2_addr, TOT_SIZE); + } +} + +int main () { + + int core_id = rt_core_id(); + + allocate_mem_to_cores(); + + TransferParameters transfer; + + #ifdef MULTI_CORE_P + // MULTI CORE PARALLEL MODE: each core uses the iDMA in a parallel manner + if (core_id == 0) { + PRINTF ("MULTI CORE PARALLEL MODE \n"); + } + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + transfer = idma_presets[k]; + #else + transfer = transfer_params[k]; + #endif + print_transfer(transfer); + // L1 -> L2 + errors[core_id] += test_idma_2D(core_id, transfer, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_2D(core_id, transfer, 1, 0); + // L1 -> L1 + errors[core_id] += test_idma_2D(core_id, transfer, 0, 1); + } + synch_barrier(); + #elif MULTI_CORE_S + // MULTI CORE SERIAL MODE: each core uses the iDMA in a serial manner + if (core_id == 0) { + PRINTF ("MULTI CORE SERIAL MODE \n"); + } + for (int i = 0; i < 8; i++) { + if (core_id == i) { + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + transfer = idma_presets[k]; + #else + transfer = transfer_params[k]; + #endif + print_transfer(transfer); + // L1 -> L2 + errors[core_id] += test_idma_2D(core_id, transfer, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_2D(core_id, transfer, 1, 0); + // L1 -> L1 + errors[core_id] += test_idma_2D(core_id, transfer, 0, 1); + } + } + } + #else + // SINGLE CORE MODE: just core 0 uses the iDMA + if (core_id == 0) { + PRINTF ("SINGLE CORE MODE \n"); + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + transfer = idma_presets[k]; + #else + transfer = transfer_params[k]; + #endif + print_transfer(transfer); + // L1 -> L2 + errors[core_id] += test_idma_2D(core_id, transfer, 
0, 0); + // L2 -> L1 + errors[core_id] += test_idma_2D(core_id, transfer, 1, 0); + // L1 -> L1 + errors[core_id] += test_idma_2D(core_id, transfer, 0, 1); + } + } + #endif + + if (core_id == 0) { + for (int i = 0; i<8; i++) { + if (errors[i] == 0) { + PRINTF ("Core %d returned %d errors \n", i, errors[i]); + test_status --; + } + } + } + + free_allocated_memory(); + + return test_status; +} \ No newline at end of file diff --git a/idma_tests/idma_multi_core_2d/idma_multi_core_2d.h b/idma_tests/idma_multi_core_2d/idma_multi_core_2d.h new file mode 100644 index 0000000..87d9047 --- /dev/null +++ b/idma_tests/idma_multi_core_2d/idma_multi_core_2d.h @@ -0,0 +1,48 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +//Author: Andreas Kuster +// +//Description: Generated register defines for dma_frontend + +#include "pulp.h" +#include +#include +#include +#include +#include + +#include "idma_defines.h" +#include "idma_parameters.h" +#include "idma_presets.h" + +#ifndef _DMA_FRONTEND_REG_DEFS_ +#define _DMA_FRONTEND_REG_DEFS_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DMA_CONF_DECOUPLE 0 +#define DMA_CONF_DEBURST 0 +#define DMA_CONF_SERIALIZE 0 + +#define CORE_SPACE 2048 + +#ifdef QUICK_MODE +#define TRANSFERS 7 +#else +#define TRANSFERS NB_TRANSFERS +#endif + +#ifdef VERBOSE + #define PRINTF(...) printf(__VA_ARGS__) +#else + #define PRINTF(...) 
+#endif + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // _DMA_FRONTEND_REG_DEFS_ diff --git a/idma_tests/idma_multi_core_2d/idma_parameters.h b/idma_tests/idma_multi_core_2d/idma_parameters.h new file mode 100644 index 0000000..bc02dbc --- /dev/null +++ b/idma_tests/idma_multi_core_2d/idma_parameters.h @@ -0,0 +1,18 @@ +typedef struct { + unsigned int size; + unsigned int length; + unsigned int src_stride; + unsigned int dst_stride; +} TransferParameters; + +TransferParameters transfer_params[] = { +{5, 1, 10, 8}, +{64, 6, 12, 8}, +{46, 5, 11, 7}, +{66, 3, 5, 9}, +{54, 7, 13, 10}, +{84, 8, 18, 15}, +{106, 2, 12, 11}, +{65, 9, 12, 12}, +}; + diff --git a/idma_tests/idma_multi_core_2d/idma_presets.h b/idma_tests/idma_multi_core_2d/idma_presets.h new file mode 100644 index 0000000..edc740a --- /dev/null +++ b/idma_tests/idma_multi_core_2d/idma_presets.h @@ -0,0 +1,11 @@ + +TransferParameters idma_presets[] = { +{1, 1, 1, 1}, +{2, 2, 512, 512}, +{4, 2, 256, 256}, +{16, 8, 64, 64}, +{64, 8, 16, 16}, +{128, 8, 8, 8}, +{512, 2, 4, 4}, +}; + diff --git a/idma_tests/idma_multi_core_3d/Makefile b/idma_tests/idma_multi_core_3d/Makefile new file mode 100644 index 0000000..e206393 --- /dev/null +++ b/idma_tests/idma_multi_core_3d/Makefile @@ -0,0 +1,27 @@ +PULP_APP = test +TEST_SRCS ?= idma_multi_core_3d.c +PULP_APP_SRCS = $(TEST_SRCS) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +ifdef MULTI_CORE_P +PULP_CFLAGS += -DMULTI_CORE_P +endif + +ifdef MULTI_CORE_S +PULP_CFLAGS += -DMULTI_CORE_S +endif + +ifdef QUICK_MODE +PULP_CFLAGS += -DQUICK_MODE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk + +stimuli: + python gen_stimuli_idma_3d.py \ No newline at end of file diff --git a/idma_tests/idma_multi_core_3d/gen_stimuli_idma_3d.py b/idma_tests/idma_multi_core_3d/gen_stimuli_idma_3d.py new file mode 100644 index 0000000..75469b9 --- /dev/null +++ b/idma_tests/idma_multi_core_3d/gen_stimuli_idma_3d.py @@ -0,0 +1,66 @@ 
#!/usr/bin/env python
"""Generate the C headers that parameterize the 3D iDMA multi-core test."""

import sys
import random
import argparse
import math
import re


# Size in bytes of the scratch window each core owns in the C test
# (CORE_SPACE in idma_multi_core_3d.h). A generated transfer must fit in it.
CORE_SPACE = 2048

# Upper bound on re-draws for a single transfer before giving up.
_MAX_DRAWS = 1000

parser = argparse.ArgumentParser(description='Generate stimuli for iDMA simple transfer')

# Parse the command line only when executed as a script, so that the helper
# functions below can be imported without consuming the importer's argv.
if __name__ == '__main__':
    args = parser.parse_args()


def write_transfer_parameters_struct(f, name):
    """Write the C typedef of the 3D transfer descriptor, named *name*, to *f*."""
    f.write('typedef struct {\n')
    f.write(' unsigned int size;\n')
    f.write(' unsigned int length;\n')
    f.write(' unsigned int src_stride_2d;\n')
    f.write(' unsigned int dst_stride_2d;\n')
    f.write(' unsigned int src_stride_3d;\n')
    f.write(' unsigned int dst_stride_3d;\n')
    f.write(' unsigned int num_reps_3d;\n')
    f.write('} %s;\n\n' % name)


def _footprint(size, length, stride_2d, stride_3d, num_reps_3d):
    """Return how many bytes one side of a transfer spans in the C test.

    Mirrors the address walk of test_idma_3D: ``size // length`` rows spaced
    ``stride_2d`` apart form a plane, and consecutive planes start
    ``(rows - 1) * stride_2d + stride_3d`` bytes apart.
    """
    last_row = (size // length - 1) * stride_2d
    return (num_reps_3d - 1) * (last_row + stride_3d) + last_row + length


def _draw_transfer(max_footprint):
    """Draw one random 7-field transfer tuple that fits in *max_footprint*.

    The limits come from the module-level MAX_LENGTH, TRANSFER_SIZE,
    MAX_STRIDE and MAX_REPS. With ``max_footprint=None`` the first draw is
    returned unchecked.

    Raises:
        ValueError: if no fitting transfer is found in _MAX_DRAWS attempts.
    """
    for _ in range(_MAX_DRAWS):
        length = random.randint(1, MAX_LENGTH)
        size = random.randint(1, TRANSFER_SIZE) + length
        src_stride_2d = random.randint(1, MAX_STRIDE) + length
        dst_stride_2d = random.randint(1, MAX_STRIDE) + length
        src_stride_3d = random.randint(1, MAX_STRIDE) + length
        dst_stride_3d = random.randint(1, MAX_STRIDE) + length
        num_reps_3d = random.randint(1, MAX_REPS)

        params = (size, length, src_stride_2d, dst_stride_2d,
                  src_stride_3d, dst_stride_3d, num_reps_3d)
        if max_footprint is None:
            return params

        src_bytes = _footprint(size, length, src_stride_2d, src_stride_3d, num_reps_3d)
        dst_bytes = _footprint(size, length, dst_stride_2d, dst_stride_3d, num_reps_3d)
        if max(src_bytes, dst_bytes) <= max_footprint:
            return params

    raise ValueError('no transfer fits in %d bytes after %d draws'
                     % (max_footprint, _MAX_DRAWS))


def write_transfer_parameters_array(f, name, arr, max_footprint=CORE_SPACE):
    """Write a C array *name* with one random transfer per element of *arr*.

    Only the length of *arr* matters; its elements are ignored. Draws whose
    source or destination side would span more than *max_footprint* bytes are
    rejected and re-drawn, because the C test gives every core a window of
    exactly CORE_SPACE bytes and a larger transfer would spill into the next
    core's window. Pass ``max_footprint=None`` to disable the check.

    Raises:
        ValueError: if a fitting transfer cannot be drawn (see _draw_transfer).
    """
    f.write('TransferParameters %s[] = {\n' % name)
    for _ in arr:
        f.write('{%d, %d, %d, %d, %d, %d, %d},\n' % _draw_transfer(max_footprint))
    f.write('};\n\n')


def write_define(f, name, val):
    """Write ``#define name val`` (integer value) followed by a blank line."""
    f.write('#define %s %d\n\n' % (name, val))


# Randomize between 1 and 10 the number of 3D iDMA transfers to be performed
# For each transfer we set a size in bytes.
+ +NB_TRANSFERS = random.randint(1, 10) + +# Randomize between 1 and 128 the size of each transfer +TRANSFER_SIZE = 128 +MAX_STRIDE = 10 +MAX_LENGTH = 10 +MAX_REPS = 5 + +transfer_params = [None] * NB_TRANSFERS + +f_params = open('idma_parameters.h', 'w') +f_defines = open('idma_defines.h', 'w') + +write_define(f_defines, 'NB_TRANSFERS', NB_TRANSFERS) +write_transfer_parameters_struct(f_params, 'TransferParameters') +write_transfer_parameters_array(f_params, 'transfer_params', transfer_params) + +f_params.close() +f_defines.close() \ No newline at end of file diff --git a/idma_tests/idma_multi_core_3d/idma_defines.h b/idma_tests/idma_multi_core_3d/idma_defines.h new file mode 100644 index 0000000..05d5bda --- /dev/null +++ b/idma_tests/idma_multi_core_3d/idma_defines.h @@ -0,0 +1,2 @@ +#define NB_TRANSFERS 10 + diff --git a/idma_tests/idma_multi_core_3d/idma_multi_core_3d.c b/idma_tests/idma_multi_core_3d/idma_multi_core_3d.c new file mode 100644 index 0000000..b8479ff --- /dev/null +++ b/idma_tests/idma_multi_core_3d/idma_multi_core_3d.c @@ -0,0 +1,230 @@ +#include "idma_multi_core_3d.h" + +#define TOT_SIZE 8 * CORE_SPACE + +int errors[8] = {0}; +int test_status = 8; + +uint32_t l1_addr[8] = {0}; +uint32_t l1_dst_addr[8] = {0}; +uint32_t l2_addr[8] = {0}; + +void print_transfer (TransferParameters transfer) { + if (rt_core_id() == 0) { + PRINTF ("Transfer Parameters: \n"); + PRINTF ("Size: %d | Length: %d \n", transfer.size, transfer.length); + PRINTF ("Src_stride_2d: %d | Dst_stride_2d: %d \n", transfer.src_stride_2d, transfer.dst_stride_2d); + PRINTF ("Src_stride_3d: %d | Dst_stride_3d: %d \n", transfer.src_stride_3d, transfer.dst_stride_3d); + PRINTF ("Num_reps_3d: %d \n", transfer.num_reps_3d); + } +} + +int test_idma_3D (int core_id, TransferParameters transfer, int ext2loc, int loc2loc) { + volatile uint8_t *src_ptr, *dst_ptr; + unsigned int offset_3d = 0; + unsigned int offset_2d = 0; + + int error = 0; + + uint32_t src_stride_2d = transfer.src_stride_2d; + 
uint32_t dst_stride_2d = transfer.dst_stride_2d; + uint32_t src_stride_3d = transfer.src_stride_3d; + uint32_t dst_stride_3d = transfer.dst_stride_3d; + uint32_t size = transfer.size; + uint32_t length = transfer.length; + uint32_t num_reps = size/length; + uint32_t num_reps_3d = transfer.num_reps_3d; + + if (loc2loc == 1) { + // L1 to L1 transfer + src_ptr = (uint8_t*) l1_addr[core_id]; + dst_ptr = (uint8_t*) l1_dst_addr[core_id]; + } else if (ext2loc == 1) { + // L2 to L1 transfer + src_ptr = (uint8_t*) l2_addr[core_id]; + dst_ptr = (uint8_t*) l1_addr[core_id]; + } else { + // L1 to L2 transfer + src_ptr = (uint8_t*) l1_addr[core_id]; + dst_ptr = (uint8_t*) l2_addr[core_id]; + } + + // Fill source region with test data + for (int j = 0; j < num_reps_3d; j++) { + for (int q = 0; q < num_reps; q++) { + for (int i = 0; i < length; i++) { + src_ptr[i+offset_2d+offset_3d] = (uint8_t)(i & 0xFF); + } + offset_2d += src_stride_2d; + } + offset_2d = 0; + offset_3d += (num_reps-1) * src_stride_2d + src_stride_3d; + } + + + if (loc2loc == 1) { + plp_cl_dma_wait_toL1(pulp_cl_idma_L1ToL1_3d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride_2d, dst_stride_2d, num_reps, + src_stride_3d, dst_stride_3d, num_reps_3d)); + } else if (ext2loc == 1) { + plp_cl_dma_wait_toL1(pulp_cl_idma_L2ToL1_3d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride_2d, dst_stride_2d, num_reps, + src_stride_3d, dst_stride_3d, num_reps_3d)); + } else { + plp_cl_dma_wait_toL2(pulp_cl_idma_L1ToL2_3d((unsigned int)src_ptr, (unsigned int)dst_ptr, length, src_stride_2d, dst_stride_2d, num_reps, + src_stride_3d, dst_stride_3d, num_reps_3d)); + } + + // Check the results + int src_offset_2d = 0; + int dst_offset_2d = 0; + int src_offset_3d = 0; + int dst_offset_3d = 0; + + for (int rep_3d = 0; rep_3d < num_reps_3d; rep_3d ++) { + for (unsigned int rep = 0; rep < num_reps; rep++) { + for (unsigned int i = 0; i < length; i++) { + uint8_t expected = src_ptr[src_offset_2d + src_offset_3d 
+ i]; + uint8_t actual = dst_ptr[dst_offset_2d + dst_offset_3d + i]; + + if (expected != actual) { + if (core_id == 0) { + PRINTF ("ERROR: expected @%8x[%d] = %8x vs actual @%8x[%d] = %8x \n", &src_ptr[src_offset_2d + src_offset_3d + i], src_offset_2d + src_offset_3d + i, + expected, &dst_ptr[dst_offset_2d + dst_offset_3d + i], dst_offset_2d + dst_offset_3d + i, actual); + } + error++; + } + } + src_offset_2d += src_stride_2d; + dst_offset_2d += dst_stride_2d; + } + src_offset_2d = 0; + dst_offset_2d = 0; + src_offset_3d += (num_reps-1) * src_stride_2d + src_stride_3d; + dst_offset_3d += (num_reps-1) * dst_stride_2d + dst_stride_3d; + } + + return error; +} + +void allocate_mem_to_cores () { + int core_id = rt_core_id(); + + // Pre-allocate TOT_SIZE = 8 * CORE_SPACE: then we split this window to assign + // each core its available space for iDMA transfers + // pi_l1_malloc starts allocating from 0x10004008 in L1 + // pi_l2_malloc starts allocating from 0x1c000a60 in L2 + + if (core_id == 0) { + l1_addr[0] = (uint32_t) pi_l1_malloc(0, TOT_SIZE); + l1_dst_addr[0] = (uint32_t) pi_l1_malloc(0, TOT_SIZE); + l2_addr[0] = (uint32_t) pi_l2_malloc(TOT_SIZE); + } + + // The following synch_barrier is needed so that + // no core can assign its address range until the mallocs are executed + synch_barrier(); + + l1_addr[core_id] = l1_addr[0] + core_id * CORE_SPACE; + l1_dst_addr[core_id] = l1_dst_addr[0] + core_id * CORE_SPACE; + l2_addr[core_id] = l2_addr[0] + core_id * CORE_SPACE; + + // The following synch_barrier is needed so that + // no core can start executing until all address ranges have been assigned + synch_barrier(); + +} + +void free_allocated_memory () { + + synch_barrier(); + // Only Core 0 takes care of freeing the allocated memory, since it's the one + // allocated it at the beginning of the test + if (core_id == 0) { + pi_l1_free(0, l1_addr, TOT_SIZE); + pi_l1_free(0, l1_dst_addr, TOT_SIZE); + pi_l2_free(l2_addr, TOT_SIZE); + } +} + +int main () { + + int 
core_id = rt_core_id(); + + allocate_mem_to_cores(); + + TransferParameters transfer; + + #ifdef MULTI_CORE_P + // MULTI CORE PARALLEL MODE: each core uses the iDMA in a parallel manner + if (core_id == 0) { + PRINTF ("MULTI CORE PARALLEL MODE \n"); + } + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + transfer = idma_presets[k]; + #else + transfer = transfer_params[k]; + #endif + print_transfer(transfer); + // L1 -> L2 + errors[core_id] += test_idma_3D(core_id, transfer, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_3D(core_id, transfer, 1, 0); + // L1 -> L1 transfer + errors[core_id] += test_idma_3D(core_id, transfer, 0, 1); + } + synch_barrier(); + #elif MULTI_CORE_S + // MULTI CORE SERIAL MODE: each core uses the iDMA in a serial manner + if (core_id == 0) { + PRINTF ("MULTI CORE SERIAL MODE \n"); + } + for (int i = 0; i < 8; i++) { + if (core_id == i) { + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + transfer = idma_presets[k]; + #else + transfer = transfer_params[k]; + #endif + print_transfer(transfer); + // L1 -> L2 + errors[core_id] += test_idma_3D(core_id, transfer, 0, 0); + // L2 -> L1 + errors[core_id] += test_idma_3D(core_id, transfer, 1, 0); + // L1 -> L1 transfer + errors[core_id] += test_idma_3D(core_id, transfer, 0, 1); + } + } + } + #else + if (core_id == 0) { + for (int k = 0; k < TRANSFERS; k++) { + #ifdef QUICK_MODE + transfer = idma_presets[k]; + #else + transfer = transfer_params[k]; + #endif + print_transfer(transfer); + PRINTF ("L1 to L2 \n"); + errors[core_id] += test_idma_3D(core_id, transfer, 0, 0); + PRINTF ("L2 to L1 \n"); + errors[core_id] += test_idma_3D(core_id, transfer, 1, 0); + PRINTF ("L1 to L1 \n"); + errors[core_id] += test_idma_3D(core_id, transfer, 0, 1); + } + } + #endif + + if (core_id == 0) { + for (int i = 0; i<8; i++) { + if (errors[i] == 0) { + PRINTF ("Core %d returned %d errors \n", i, errors[i]); + test_status--; + } + } + } + + free_allocated_memory(); + + return test_status; +} \ No 
newline at end of file diff --git a/idma_tests/idma_multi_core_3d/idma_multi_core_3d.h b/idma_tests/idma_multi_core_3d/idma_multi_core_3d.h new file mode 100644 index 0000000..a6547ba --- /dev/null +++ b/idma_tests/idma_multi_core_3d/idma_multi_core_3d.h @@ -0,0 +1,48 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +//Author: Andreas Kuster +// +//Description: Generated register defines for dma_frontend + +#include "pulp.h" +#include +#include +#include +#include +#include + +#include "idma_defines.h" +#include "idma_parameters.h" +#include "idma_presets.h" + +#ifndef _DMA_FRONTEND_REG_DEFS_ +#define _DMA_FRONTEND_REG_DEFS_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DMA_CONF_DECOUPLE 0 +#define DMA_CONF_DEBURST 0 +#define DMA_CONF_SERIALIZE 0 + +#define CORE_SPACE 2048 + +#ifdef QUICK_MODE +#define TRANSFERS 9 +#else +#define TRANSFERS NB_TRANSFERS +#endif + +#ifdef VERBOSE + #define PRINTF(...) printf(__VA_ARGS__) +#else + #define PRINTF(...) 
+#endif + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // _DMA_FRONTEND_REG_DEFS_ diff --git a/idma_tests/idma_multi_core_3d/idma_parameters.h b/idma_tests/idma_multi_core_3d/idma_parameters.h new file mode 100644 index 0000000..77c06db --- /dev/null +++ b/idma_tests/idma_multi_core_3d/idma_parameters.h @@ -0,0 +1,23 @@ +typedef struct { + unsigned int size; + unsigned int length; + unsigned int src_stride_2d; + unsigned int dst_stride_2d; + unsigned int src_stride_3d; + unsigned int dst_stride_3d; + unsigned int num_reps_3d; +} TransferParameters; + +TransferParameters transfer_params[] = { +{107, 5, 12, 13, 7, 12, 1}, +{86, 5, 7, 10, 6, 12, 4}, +{95, 10, 12, 20, 15, 17, 4}, +{40, 2, 4, 12, 5, 4, 3}, +{105, 4, 14, 6, 13, 9, 3}, +{72, 8, 11, 17, 13, 15, 2}, +{54, 10, 14, 20, 13, 11, 3}, +{74, 6, 15, 10, 10, 11, 1}, +{114, 4, 9, 8, 12, 9, 3}, +{92, 3, 13, 5, 8, 5, 1}, +}; + diff --git a/idma_tests/idma_multi_core_3d/idma_presets.h b/idma_tests/idma_multi_core_3d/idma_presets.h new file mode 100644 index 0000000..82ce6d6 --- /dev/null +++ b/idma_tests/idma_multi_core_3d/idma_presets.h @@ -0,0 +1,16 @@ + +// Parameters are declared in this order: +// size, length, src_stride_2d, dst_stride_2d, src_stride_3d, dst_stride_3d, num_reps_3d + +TransferParameters idma_presets[] = { +{1, 1, 1, 1, 1, 1, 1}, +{2, 8, 8, 8, 8, 8, 2}, +{3, 8, 8, 8, 8, 8, 4}, +{4, 8, 16, 16, 16, 16, 8}, +{8, 8, 32, 32, 32, 32, 8}, +{16, 8, 16, 16, 16, 16, 8}, +{32, 2, 2, 2, 2, 2, 2}, +{64, 4, 4, 4, 4, 4, 2}, +{128, 4, 4, 4, 4, 4, 4}, +}; + diff --git a/idma_tests/idma_simple/Makefile b/idma_tests/idma_simple/Makefile new file mode 100755 index 0000000..15ad295 --- /dev/null +++ b/idma_tests/idma_simple/Makefile @@ -0,0 +1,22 @@ +PULP_APP = test + +TEST_SRCS ?= simple_tx.c +PULP_APP_SRCS = $(TEST_SRCS) +PULP_APP_FC_SRCS = $(TEST_FC_SRCS) +ifdef TEST_FC_SRCS +pulpFc=1 +endif + +space := +space += + +#BUILD_DIR = $(subst $(space),_,$(CURDIR)/build/$(TEST_SRCS)) + +ifdef VERBOSE +PULP_CFLAGS += 
-DVERBOSE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/idma_tests/idma_simple/dma_wave.do b/idma_tests/idma_simple/dma_wave.do new file mode 100644 index 0000000..df25845 --- /dev/null +++ b/idma_tests/idma_simple/dma_wave.do @@ -0,0 +1,419 @@ +onerror {resume} +quietly WaveActivateNextPane {} 0 +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/rst_ni +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/test_mode_i +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/ext_master_req_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/ext_master_resp_i +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_event_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_irq_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_event_pe_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_irq_pe_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/busy_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_wdata +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_add +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_wen +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_be +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_id +add wave 
-noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_gnt +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_rdata +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_opc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_id +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_req_from_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_req_from_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_req_from_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_req_from_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_req_muxed +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_rsp_to_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_rsp_to_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_rsp_to_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_rsp_to_rrc +add wave -noupdate -expand -group {iDMA wrap} -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_dma[0]} -expand} 
/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_dma +add wave -noupdate -expand -group {iDMA wrap} -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_rrc[0]} -expand} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_rsp_to_mux +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/soc_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/soc_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_regs_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_regs_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/stream_idx +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_req_queue +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/idma_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/idma_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/one_fe_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/fe_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_queue_valid +add wave -noupdate -expand -group {iDMA wrap} 
/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_rsp_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/fe_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_queue_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_rsp_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/trans_complete +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/midend_busy +add wave -noupdate -expand -group {iDMA wrap} -expand /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/idma_busy +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/done_id +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/next_id +add wave -noupdate -group {init strm1} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/clk_i +add wave -noupdate -group {init strm1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_write_rsp} +add wave -noupdate -group {init strm1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_write_req} +add wave -noupdate -group {init strm1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_req.req_valid} +add wave -noupdate -group {init strm1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_rsp.req_ready} +add wave -noupdate -group {init strm1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_rsp.rsp_valid} +add wave -noupdate -group {init strm1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_req.rsp_ready} +add wave -noupdate -group {init strm1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_rsp} +add wave -noupdate -group {init strm1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_req} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/clk_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/rst_ni} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/testmode_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_read_req_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_read_rsp_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_read_req_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_read_rsp_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_read_req_o} +add wave 
-noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_read_rsp_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_write_req_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_write_rsp_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_write_req_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_write_rsp_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_req_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_valid_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_ready_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_rsp_o} +add wave -noupdate -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_valid_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_ready_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_valid_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_ready_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_valid_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_ready_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_req_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_valid_i} +add wave -noupdate -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_ready_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_req_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_valid_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_ready_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/dp_poison_i} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_chan_ready_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_chan_valid_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_busy_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_busy_o} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_busy_o} +add wave -noupdate -group {to OBI transport 
layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_buffer_in_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_in_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_in_valid} +add wave -noupdate -group {to OBI transport layer} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_valid_shifted} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_out_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_out_ready} +add wave -noupdate -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_ready_shifted} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_buffer_in} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_in} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_in} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_shifted} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_shifted} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_chan_valid} +add wave 
-noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_chan_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_chan_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_chan_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_chan_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_chan_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_ready} +add wave -noupdate -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_rsp} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_rsp} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_rsp} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_ar_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_ar_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_ar_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp_valid} +add wave -noupdate 
-group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_aw_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_aw_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_valid} +add wave -noupdate -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_in_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_in_ready} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_protocol} +add wave -noupdate -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_valid} +add wave -noupdate -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/rst_ni} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/testmode_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/idma_req_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/req_valid_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/req_ready_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/idma_rsp_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/rsp_valid_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/rsp_ready_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/idma_eh_req_i} +add wave -noupdate -group {backend 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/eh_req_valid_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/eh_req_ready_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/axi_read_req_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/axi_read_rsp_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/init_read_req_o} +add wave -noupdate -group {backend 1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/init_read_rsp_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/obi_read_req_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/obi_read_rsp_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/init_write_req_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/init_write_rsp_i} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/obi_write_req_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/obi_write_rsp_i} +add wave 
-noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/busy_o} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/dp_busy} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/dp_poison} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_req} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_req} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_meta_req_tagged} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_meta_req_tagged} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_last_burst} +add wave -noupdate -group {backend 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_last_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_super_last} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_req_in_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_req_in_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_req_out_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_req_out_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_req_out} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_req_out_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_req_out_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_req_out} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_rsp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_rsp} +add wave -noupdate -group 
{backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_rsp_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_rsp_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_dp_rsp_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/w_dp_rsp_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/ar_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/ar_ready_dp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/aw_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/aw_ready_dp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/aw_valid_dp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/ar_valid_dp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/aw_req_dp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/ar_req_dp} +add wave -noupdate -group {backend 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/legalizer_flush} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/legalizer_kill} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/is_length_zero} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/req_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/idma_rsp} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/rsp_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/rsp_ready} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_chan_valid} +add wave -noupdate -group {backend 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/r_chan_ready} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/clk_i} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/rst_ni} +add wave -noupdate -group {legalizer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/req_i} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/valid_i} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/ready_o} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_req_o} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_valid_o} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_ready_i} +add wave -noupdate -group {legalizer 1} -expand -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_req_o.w_dp_req} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_req_o.aw_req} -expand} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_req_o} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_valid_o} +add wave -noupdate -group {legalizer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_ready_i} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/flush_i} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/kill_i} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_busy_o} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_busy_o} +add wave -noupdate -group {legalizer 1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_tf_d} +add wave -noupdate -group {legalizer 1} -childformat {{{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_tf_q.length} -radix unsigned}} -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_tf_q.length} {-height 16 -radix unsigned}} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_tf_q} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_tf_d} +add wave -noupdate -group {legalizer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_tf_q} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/opt_tf_d} +add wave -noupdate -group {legalizer 1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/opt_tf_q} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_tf_ena} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_tf_ena} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_page_num_bytes_to_pb} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_num_bytes_to_pb} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_page_num_bytes_to_pb} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_num_bytes_to_pb} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/c_num_bytes_to_pb} +add wave 
-noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_num_bytes_possible} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_num_bytes} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_addr_offset} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/r_done} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_num_bytes_possible} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_num_bytes} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_addr_offset} +add wave -noupdate -group {legalizer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/w_done} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/addr_i} +add wave -noupdate -group {rd page splitter 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/not_bursting_i} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/reduce_len_i} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/max_llen_i} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/num_bytes_to_pb_o} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/page_addr_width} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/page_size} +add wave -noupdate -group {rd page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_read_page_splitter/page_offset} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/addr_i} +add wave -noupdate -group {wr page splitter 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/not_bursting_i} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/reduce_len_i} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/max_llen_i} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/num_bytes_to_pb_o} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/page_addr_width} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/page_size} +add wave -noupdate -group {wr page splitter 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/gen_hw_legalizer/i_idma_legalizer/i_write_page_splitter/page_offset} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/clk_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/rst_ni} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/testmode_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_read_req_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_read_rsp_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_read_req_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_read_rsp_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_read_req_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_read_rsp_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_write_req_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_write_rsp_i} +add wave -noupdate -group {transport layer 1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_write_req_o} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_write_rsp_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_req_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_valid_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_ready_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_rsp_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_valid_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_ready_i} +add wave -noupdate -group {transport layer 1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_valid_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_ready_o} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_valid_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_ready_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_req_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_valid_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_ready_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_req_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_valid_i} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_ready_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/dp_poison_i} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_chan_ready_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_chan_valid_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_busy_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_busy_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_busy_o} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_buffer_in_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_in_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_in_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_ready} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_valid_shifted} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_out_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_out_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_ready_shifted} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_buffer_in} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_in} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_in} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in} +add wave -noupdate -group 
{transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_shifted} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_shifted} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_chan_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_chan_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_chan_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_chan_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_chan_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_chan_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_valid} +add wave -noupdate -group 
{transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_rsp} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_rsp} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_rsp} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_ar_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_ar_ready} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_ar_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_aw_ready} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_aw_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_ready} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_in_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_in_ready} +add wave -noupdate -group {transport layer 1} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_protocol} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_valid} +add wave -noupdate -group {transport layer 1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_ready} +add wave -noupdate -group {strm1 init read} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_dp_req_i} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_dp_valid_i} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_dp_ready_o} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_dp_rsp_o} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_dp_valid_o} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_dp_ready_i} +add wave -noupdate -group {strm1 init read} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/read_meta_req_i} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/read_meta_valid_i} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/read_meta_ready_o} +add wave -noupdate -group {strm1 init read} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/read_req_o} +add wave -noupdate -group {strm1 init read} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/read_rsp_i} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_chan_ready_o} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/r_chan_valid_o} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/buffer_in_o} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/buffer_in_valid_o} +add wave -noupdate -group {strm1 init read} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/buffer_in_ready_i} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/read_aligned_in_mask} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/mask_in} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/in_valid} +add wave -noupdate -group {strm1 init read} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/i_idma_init_read/in_ready} +TreeUpdate [SetDefaultTree] +quietly WaveActivateNextPane +add wave -noupdate -color Gold {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/clk_i} +TreeUpdate [SetDefaultTree] +WaveRestoreCursors {{Cursor 1} {1574893420000 ps} 0} +quietly wave cursor active 1 +configure wave -namecolwidth 252 +configure wave -valuecolwidth 224 +configure wave -justifyvalue left +configure wave -signalnamewidth 1 +configure wave -snapdistance 10 +configure wave -datasetprefix 0 +configure wave -rowmargin 4 +configure wave -childrowmargin 2 +configure wave -gridoffset 0 +configure wave -gridperiod 1 +configure wave -griddelta 40 +configure wave -timeline 0 +configure wave -timelineunits ns +update +WaveRestoreZoom {248010100 ns} {3541692100 ns} diff --git a/idma_tests/idma_simple/simple_tx.c b/idma_tests/idma_simple/simple_tx.c new file mode 100644 index 0000000..5156078 --- /dev/null +++ b/idma_tests/idma_simple/simple_tx.c @@ 
-0,0 +1,133 @@ +#include +#include "pulp.h" +//#include "mchan_tests.h" + +#define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 + +//static unsigned char *ext; +//static unsigned char *loc; + +//#define EXT_DATA_ADDR ((unsigned int)ext) +//#define TCDM_DATA_ADDR ((unsigned int)loc) + +L2_DATA static unsigned char ext[MAX_BUFFER_SIZE]; +L1_DATA static unsigned char loc[MAX_BUFFER_SIZE]; + +#define EXT_DATA_ADDR ((unsigned int) ext) +#define TCDM_DATA_ADDR ((unsigned int) loc) +typedef enum {RX, TX} test_type_t; + +int testMCHAN(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr); +int main() +{ + + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + + int error_count = 0; + + + /* if (rt_core_id() == 0) */ +/* { */ +/* if ((ext = rt_alloc(RT_ALLOC_L2_CL_DATA, MAX_BUFFER_SIZE)) == 0) return -1; */ +/* if ((loc = rt_alloc(RT_ALLOC_CL_DATA, MAX_BUFFER_SIZE)) == 0) return -1; */ +/* } */ + + if (get_core_id() == 0){ + + + for ( int i = 5; i < 8045; i=5*i) { + error_count += testMCHAN(i, TX, ext, loc); + } + for ( int i = 5; i < 8045; i=5*i ) { + error_count += testMCHAN(i, RX, ext, loc); + } + + } + + return error_count; + +} + +int testMCHAN(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr){ + + volatile unsigned int i,j,id; + volatile unsigned int test,read,error=0; + + if (type == RX){ + +#ifdef VERBOSE + printf ("STARTING TEST FOR RX %d OPERATION: \n", len); +#endif + + for (i=0; i +#include +#include +#include +#include "pulp.h" + +#define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 + +#define min_size 32 // 256 for wide AXI port +#define max_size 45 + +L2_DATA static uint8_t ext[MAX_BUFFER_SIZE]; // External memory buffer (L2) +L1_DATA static uint8_t loc[MAX_BUFFER_SIZE]; // Local memory buffer (TCDM / L1) + +typedef enum {L2_TO_L1, L1_TO_L2} test_type_t; + +int test_idma(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr); + +int main(void) { + if (rt_cluster_id() != 0) { 
+ printf("Test only runs on cluster 0\n"); + return bench_cluster_forward(0); + } + + int error_count = 0; + + if (get_core_id() == 0) { + + // Test for L2_TO_L1 operation + for (int size = min_size; size < max_size; size++) { + error_count += test_idma(size, L2_TO_L1, (uint32_t)ext, (uint32_t)loc); + } + + // Test for L1_TO_L2 operation + for (int size = min_size; size < max_size; size++) { + error_count += test_idma(size, L1_TO_L2, (uint32_t)ext, (uint32_t)loc); + } + } + + return error_count; +} + +int test_idma(uint32_t size, test_type_t type, uint32_t ext_addr, uint32_t tcdm_addr) { + volatile uint8_t expected, actual; + volatile int error = 0; + volatile unsigned int id; + + if (type == L2_TO_L1) { + + for (uint32_t i = 0; i < size; i++) { + *(uint8_t *)(ext_addr + i) = (uint8_t)(i & 0xFF); + } + + memset((void *)tcdm_addr, 0, size + 16); + + id = pulp_idma_memcpy(ext_addr, tcdm_addr, size, IDMA_PROT_AXI, IDMA_PROT_OBI); + + /* L2_TO_L1 transaction: + +----------------+ +----------------+ + | | DMA Transfer | | + | L2 Memory | ---------------------> | L1 Memory | + | (ext buffer) | (pulp_idma_memcpy) | (loc buffer) | + | | | | + +----------------+ +----------------+ + */ + + } else if (type == L1_TO_L2) { + // Fill L1 buffer with a pattern + for (uint32_t i = 0; i < size; i++) { + *(uint8_t *)(tcdm_addr + i) = (uint8_t)(i & 0xFF); + } + + // Clear L2 + memset((void *)ext_addr, 0, size + 16); + + // Perform DMA transfer from local memory to external memory + id = pulp_idma_memcpy(tcdm_addr, ext_addr, size, IDMA_PROT_OBI, IDMA_PROT_AXI); + + /* L1_TO_L2 transaction: + +----------------+ +----------------+ + | | DMA Transfer | | + | L2 Memory | <--------------------- | L1 Memory | + | (ext buffer) | (pulp_idma_memcpy) | (loc buffer) | + | | | | + +----------------+ +----------------+ + */ + } else { + printf("Invalid test type. 
It must be either L2_TO_L1 or L1_TO_L2.\n"); + return 1; + } + + plp_dma_barrier(); + + // Verify data + for (uint32_t i = 0; i < size; i++) { + expected = (uint8_t)(i & 0xFF); // Same pattern as L2_TO_L1 + if (type == L2_TO_L1) { + // Reading from the local memory buffer "loc" allocated in L1 memory + actual = *(uint8_t *)(tcdm_addr + i); + } else { // L1_TO_L2 + // Reading from the external memory buffer "ext" allocated in L2 memory + actual = *(uint8_t *)(ext_addr + i); + } + + if (expected != actual) { + printf("Error at index %u: Expected 0x%02X, Got 0x%02X\n", i, expected, actual); + error++; + } + } + + if (error == 0) { + printf("Test passed for %s of length %u.\n", type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2", size); + } else { + printf("Test failed for %s of length %u with %u errors.\n", type == L2_TO_L1 ? "L2_TO_L1" : "L1_TO_L2", size, error); + } + + return error; +} \ No newline at end of file diff --git a/idma_tests/idma_wide/wave.do b/idma_tests/idma_wide/wave.do new file mode 100644 index 0000000..e15c840 --- /dev/null +++ b/idma_tests/idma_wide/wave.do @@ -0,0 +1,66 @@ +onerror {resume} +quietly WaveActivateNextPane {} 0 +add wave -noupdate -subitemconfig {/pulp_cluster_tb/cluster_i/s_dma_master_req.ar {-childformat {{size -radix unsigned}} -expand} /pulp_cluster_tb/cluster_i/s_dma_master_req.ar.size {-radix unsigned}} /pulp_cluster_tb/cluster_i/s_dma_master_req +add wave -noupdate -subitemconfig {/pulp_cluster_tb/cluster_i/s_dma_master_resp.r -expand} /pulp_cluster_tb/cluster_i/s_dma_master_resp +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/clk} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/req} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/gnt} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_valid} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_ready} +add wave 
-noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/add} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/wen} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/data} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/be} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/user} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/id} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_data} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_user} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_id} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_opc} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/ecc} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_ecc} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/ereq} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/egnt} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_evalid} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/r_eready} +add wave -noupdate -group {s_hci_dma[0]} {/pulp_cluster_tb/cluster_i/s_hci_dma[0]/clk_assert} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/clk} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/req} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/gnt} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_valid} +add wave -noupdate -expand -group {s_hci_dma[1]} 
{/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_ready} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/add} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/wen} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/data} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/be} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/user} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/id} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_data} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_user} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_id} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_opc} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/ecc} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_ecc} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/ereq} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/egnt} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_evalid} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/r_eready} +add wave -noupdate -expand -group {s_hci_dma[1]} {/pulp_cluster_tb/cluster_i/s_hci_dma[1]/clk_assert} +TreeUpdate [SetDefaultTree] +WaveRestoreCursors {{Cursor 1} {262653051593 ps} 1} {{Cursor 2} {262655244483 ps} 0} +quietly wave cursor active 2 +configure wave -namecolwidth 150 +configure wave -valuecolwidth 567 +configure wave -justifyvalue left +configure wave -signalnamewidth 1 
+configure wave -snapdistance 10 +configure wave -datasetprefix 0 +configure wave -rowmargin 4 +configure wave -childrowmargin 2 +configure wave -gridoffset 0 +configure wave -gridperiod 1 +configure wave -griddelta 40 +configure wave -timeline 0 +configure wave -timelineunits ps +update +WaveRestoreZoom {262620992344 ps} {262771194315 ps} diff --git a/idma_tests/idma_zeromem/Makefile b/idma_tests/idma_zeromem/Makefile new file mode 100755 index 0000000..d896e12 --- /dev/null +++ b/idma_tests/idma_zeromem/Makefile @@ -0,0 +1,22 @@ +PULP_APP = test + +TEST_SRCS ?= zeromem.c +PULP_APP_SRCS = $(TEST_SRCS) +PULP_APP_FC_SRCS = $(TEST_FC_SRCS) +ifdef TEST_FC_SRCS +pulpFc=1 +endif + +space := +space += + +#BUILD_DIR = $(subst $(space),_,$(CURDIR)/build/$(TEST_SRCS)) + +ifdef VERBOSE +PULP_CFLAGS += -DVERBOSE +endif + +PULP_CFLAGS += -O3 +stackSize = 4096 + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/idma_tests/idma_zeromem/dma_wave.do b/idma_tests/idma_zeromem/dma_wave.do new file mode 100644 index 0000000..ab54467 --- /dev/null +++ b/idma_tests/idma_zeromem/dma_wave.do @@ -0,0 +1,179 @@ +onerror {resume} +quietly WaveActivateNextPane {} 0 +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/clk_i +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/rst_ni +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/test_mode_i +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/ext_master_req_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/ext_master_resp_i +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_event_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_irq_o +add wave -noupdate -expand -group {iDMA wrap} 
/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_event_pe_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/term_irq_pe_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/busy_o +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_wdata +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_add +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_wen +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_be +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_id +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_gnt +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_rdata +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_opc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/config_r_id +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_req_from_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_req_from_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_req_from_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_req_from_rrc +add wave -noupdate -expand -group {iDMA wrap} -expand 
-subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_dma[0]} -expand} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_dma +add wave -noupdate -expand -group {iDMA wrap} -expand -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_rrc[0]} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_rrc[0].a} -expand} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_req_from_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_req_muxed +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_rsp_to_dma +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_rsp_to_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_rsp_to_dma +add wave -noupdate -expand -group {iDMA wrap} -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_rsp_to_rrc[0]} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_rsp_to_rrc[0].r} -expand} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_reorg_rsp_to_rrc +add wave -noupdate -expand -group {iDMA wrap} -expand -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_dma[0]} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_dma[0].r} -expand} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_dma +add wave -noupdate -expand -group {iDMA wrap} -expand -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_rrc[0]} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_rrc[0].r} -expand} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_write_rsp_to_rrc +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/obi_read_rsp_to_mux +add 
wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/soc_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/soc_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_regs_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/dma_regs_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/stream_idx +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_req_queue +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/idma_req +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/idma_rsp +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/one_fe_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/fe_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_queue_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_rsp_valid +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/fe_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/twod_queue_ready +add wave -noupdate -expand -group {iDMA wrap} 
/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/be_rsp_ready +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/trans_complete +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/midend_busy +add wave -noupdate -expand -group {iDMA wrap} -expand /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/idma_busy +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/done_id +add wave -noupdate -expand -group {iDMA wrap} /pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/next_id +add wave -noupdate -expand -group {init strm1} -subitemconfig {{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_write_rsp.rsp_chan} -expand} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_write_rsp} +add wave -noupdate -expand -group {init strm1} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_write_req} +add wave -noupdate -expand -group {init strm1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_rsp} +add wave -noupdate -expand -group {init strm1} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/init_read_req} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/clk_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/rst_ni} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/testmode_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_read_req_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_read_rsp_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_read_req_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_read_rsp_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_read_req_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_read_rsp_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_write_req_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_write_rsp_i} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_write_req_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_write_rsp_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_req_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_valid_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_ready_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_rsp_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_valid_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_ready_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_i} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_valid_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_ready_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_valid_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_ready_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_req_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_valid_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/ar_ready_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_req_i} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_valid_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/aw_ready_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/dp_poison_i} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_chan_ready_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_chan_valid_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/r_dp_busy_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_busy_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_busy_o} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_buffer_in_valid} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_in_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_in_valid} +add wave -noupdate -expand -group {to OBI transport layer} -expand {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_valid_shifted} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_out_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_out_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_ready} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_ready_shifted} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_buffer_in} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_buffer_in} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_buffer_in} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_in_shifted} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/buffer_out_shifted} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_chan_valid} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_chan_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_chan_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_chan_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_chan_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_chan_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_ready} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_r_dp_rsp} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_r_dp_rsp} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_r_dp_rsp} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/axi_ar_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_ar_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_ar_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp_valid} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_w_dp_rsp} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_w_dp_rsp} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/init_aw_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/obi_aw_ready} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_req_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_ready} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_dp_rsp_mux} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_in_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_in_ready} +add wave -noupdate -expand -group {to OBI transport layer} 
{/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_protocol} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_valid} +add wave -noupdate -expand -group {to OBI transport layer} {/pulp_cluster_tb/cluster_i/inst_idma/dmac_wrap_i/gen_streams[1]/gen_cpy_in/i_idma_backend_r_axi_rw_init_rw_obi/i_idma_transport_layer/w_resp_fifo_out_ready} +TreeUpdate [SetDefaultTree] +WaveRestoreCursors {{Cursor 1} {226792000000 ps} 0} +quietly wave cursor active 1 +configure wave -namecolwidth 252 +configure wave -valuecolwidth 120 +configure wave -justifyvalue left +configure wave -signalnamewidth 1 +configure wave -snapdistance 10 +configure wave -datasetprefix 0 +configure wave -rowmargin 4 +configure wave -childrowmargin 2 +configure wave -gridoffset 0 +configure wave -gridperiod 1 +configure wave -griddelta 40 +configure wave -timeline 0 +configure wave -timelineunits ns +update +WaveRestoreZoom {226663267749 ps} {227016368236 ps} diff --git a/idma_tests/idma_zeromem/zeromem.c b/idma_tests/idma_zeromem/zeromem.c new file mode 100644 index 0000000..993e457 --- /dev/null +++ b/idma_tests/idma_zeromem/zeromem.c @@ -0,0 +1,81 @@ +#include +#include "pulp.h" +//#include "mchan_tests.h" + +#define VERBOSE + +#define MAX_BUFFER_SIZE 0x2000 + +//static unsigned char *ext; +//static unsigned char *loc; + +//#define EXT_DATA_ADDR ((unsigned int)ext) +//#define TCDM_DATA_ADDR ((unsigned int)loc) + +L2_DATA static unsigned char ext[MAX_BUFFER_SIZE]; +L1_DATA static unsigned char loc[MAX_BUFFER_SIZE]; + +#define EXT_DATA_ADDR ((unsigned int) ext) +#define TCDM_DATA_ADDR ((unsigned int) loc) +typedef enum {L1_TEST, L2_TEST} test_type_t; + +int zeromem_test(unsigned int len, test_type_t type, unsigned int buf); +int main() +{ + + if 
(rt_cluster_id() != 0) + return bench_cluster_forward(0); + + + int error_count_l2 = 0; + int error_count_l1 = 0; + + + + if (get_core_id() == 0){ + + + for ( int i = 5; i < 8045; i=4*i) { + error_count_l1 += zeromem_test(i, L1_TEST, loc); + } + + if (error_count_l1) + printf("OOPS -- got %d errors in L1 zeromem tests!\n", error_count_l1); + else + printf("L1 zeromem tests passed!\n"); + for ( int i = 5; i < 8045; i=4*i ) { + error_count_l2 += zeromem_test(i, L2_TEST, ext); + } + if (error_count_l2) + printf("OOPS -- got %d errors in L2 zeromem tests!\n", error_count_l2); + else + printf("L2 zeromem tests passed!\n"); + + } + + return error_count_l1 + error_count_l2; +} + + +int zeromem_test(unsigned int len, test_type_t type, unsigned int buf) { + unsigned int tx_id; + int error_cnt = 0; + uint8_t * buf_ptr = (uint8_t *) buf; + // fill the buffer with data to make sure it gets erased + for (int i=0; i + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_siracusa_bsp.h" +#include + +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18 +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \ + (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \ + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff +#define NEUREKA_SIRACUSA_MAX_STALL (8) +#define NEUREKA_SIRACUSA_EVENT (1 << 12) +#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000) + +void neureka_siracusa_hci_setpriority_neureka() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void neureka_siracusa_hci_setpriority_core() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= + ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void neureka_siracusa_hci_reset_max_stall() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= + ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= + max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf) { + neureka_siracusa_hci_setpriority_neureka(); + neureka_siracusa_hci_set_max_stall(conf->max_stall); +} + +void neureka_siracusa_close() { + neureka_siracusa_hci_reset_max_stall(); + neureka_siracusa_hci_setpriority_core(); +} + +void neureka_siracusa_event_wait_and_clear() { + eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT); +} + +static const neureka_dev_t neureka_siracusa_dev = { + .hwpe_dev = (struct 
hwpe_dev_t){ + .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}}; + +const neureka_dev_t *neureka_siracusa_get_dev() { + return &neureka_siracusa_dev; +} diff --git a/neureka/app/bsp/siracusa/neureka_siracusa_bsp.h b/neureka/app/bsp/siracusa/neureka_siracusa_bsp.h new file mode 100644 index 0000000..458fae7 --- /dev/null +++ b/neureka/app/bsp/siracusa/neureka_siracusa_bsp.h @@ -0,0 +1,69 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_SIRACUSA_BSP_H__ +#define __NEUREKA_SIRACUSA_BSP_H__ + +#include + +#include "neureka.h" +#include + +/** + * neureka_siracusa_setpriority_neureka + * + * Set HCI interconnect bus priority to prioritize neureka. + */ +void neureka_siracusa_hci_setpriority_neureka(); + +/** + * neureka_siracusa_setpriority_core + * + * Set HCI bus priority to prioritize cores. + */ +void neureka_siracusa_hci_setpriority_core(); + +/** + * neureka_siracusa_hci_reset_maxstall + * + * Reset the HCI bus maxstall parameter. + * TODO: Check if it disables it also or just resets? + */ +void neureka_siracusa_hci_reset_max_stall(); + +/** + * neureka_siracusa_hci_set_maxstall + * + * Set the HCI bus maxstall. Maxstall defines how many cycles + * will the HCI bus stall the lower priority master, i.e. neureka or core, + * before letting it do a transaction. 
+ */ +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall); + +typedef struct neureka_siracusa_conf_t { + int max_stall; +} neureka_siracusa_conf_t; + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf); +void neureka_siracusa_close(); +void neureka_siracusa_event_wait_and_clear(); +const neureka_dev_t *neureka_siracusa_get_dev(); + +#endif // !__NEUREKA_SIRACUSA_BSP_H__ diff --git a/neureka/app/bsp/testbench/neureka_testbench_bsp.c b/neureka/app/bsp/testbench/neureka_testbench_bsp.c new file mode 100644 index 0000000..2a5d4a1 --- /dev/null +++ b/neureka/app/bsp/testbench/neureka_testbench_bsp.c @@ -0,0 +1,48 @@ +/* + * Luka Macan + * Francesco Conti + * Arpan Suravi Prasad + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_testbench_bsp.h" +#include + +#define NEUREKA_TESTBENCH_CLUSTER_CTRL_BASE_ADDR (0x00100000) +#define NEUREKA_TESTBENCH_BASE_ADDR (0x10201000) +#define NEUREKA_TESTBENCH_EVENT (1 << 12) + +void neureka_testbench_open(neureka_testbench_conf_t *conf) { + return; +} + +void neureka_testbench_close() { + return; +} + +void neureka_testbench_event_wait_and_clear() { + eu_evt_maskWaitAndClr(NEUREKA_TESTBENCH_EVENT); +} + +static const neureka_dev_t neureka_testbench_dev = { + .hwpe_dev = (struct hwpe_dev_t){ + .base_addr = (volatile uint32_t *)NEUREKA_TESTBENCH_BASE_ADDR}}; + +const neureka_dev_t *neureka_testbench_get_dev() { + return &neureka_testbench_dev; +} diff --git a/neureka/app/bsp/testbench/neureka_testbench_bsp.h b/neureka/app/bsp/testbench/neureka_testbench_bsp.h new file mode 100644 index 0000000..a6d4bd1 --- /dev/null +++ b/neureka/app/bsp/testbench/neureka_testbench_bsp.h @@ -0,0 +1,37 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_TESTBENCH_BSP_H__ +#define __NEUREKA_TESTBENCH_BSP_H__ + +#include "pulp.h" +#include "neureka.h" +#include + +typedef struct neureka_testbench_conf_t { + int max_stall; +} neureka_testbench_conf_t; + +void neureka_testbench_open(neureka_testbench_conf_t *conf); +void neureka_testbench_close(); +void neureka_testbench_event_wait_and_clear(); +const neureka_dev_t *neureka_testbench_get_dev(); + +#endif // !__NEUREKA_TESTBENCH_BSP_H__ diff --git a/neureka/app/gvsoc/neureka_gvsoc.h b/neureka/app/gvsoc/neureka_gvsoc.h new file mode 100644 index 0000000..37eeab0 --- /dev/null +++ b/neureka/app/gvsoc/neureka_gvsoc.h @@ -0,0 +1,54 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_GVSOC_H__ +#define __NEUREKA_GVSOC_H__ + +#include "neureka.h" +#include "neureka_task.h" + +#define NEUREKA_REG_GVSOC_LOG_LEVEL 24 +#define NEUREKA_REG_GVSOC_LOG_FORMAT 25 + +typedef enum neureka_gvsoc_log_format_e { + NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0, + NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3 +} neureka_gvsoc_log_format_e; + +typedef enum neureka_gvsoc_log_level_e { + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0, + NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1, + NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2, + NEUREKA_GVSOC_LOG_LEVEL_ALL = 3 +} neureka_gvsoc_log_level_e; + +static void neureka_gvsoc_log_activate(neureka_dev_t *dev, + neureka_gvsoc_log_level_e log_level, + neureka_gvsoc_log_format_e format) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level); + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format); +} + +static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END); +} + +#endif // __NEUREKA_GVSOC_H__ diff --git a/neureka/app/hal/neureka.c b/neureka/app/hal/neureka.c new file mode 100644 index 0000000..dc829d9 --- /dev/null +++ b/neureka/app/hal/neureka.c @@ -0,0 +1,37 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka.h" + +#define NEUREKA_STATUS_EMPTY (0x000) +#define NEUREKA_STATUS_FULL (0x101) + +inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) { + uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); + return (status & 0x1) + ((status >> 8) & 0x1); +} + +inline int neureka_task_queue_empty(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY; +} + +inline int neureka_task_queue_full(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL; +} diff --git a/neureka/app/hal/neureka.h b/neureka/app/hal/neureka.h new file mode 100644 index 0000000..eae77a1 --- /dev/null +++ b/neureka/app/hal/neureka.h @@ -0,0 +1,37 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_H__ +#define __NEUREKA_H__ + +#include "hwpe.h" +#include + +#define NEUREKA_TASK_QUEUE_SIZE (2) + +typedef struct neureka_dev_t { + hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */ +} neureka_dev_t; + +int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev); +int neureka_task_queue_empty(neureka_dev_t *dev); +int neureka_task_queue_full(neureka_dev_t *dev); + +#endif // __NEUREKA_H__ diff --git a/neureka/app/hal/neureka_task.c b/neureka/app/hal/neureka_task.c new file mode 100644 index 0000000..d31c934 --- /dev/null +++ b/neureka/app/hal/neureka_task.c @@ -0,0 +1,248 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_task.h" +#include "neureka_task_defs.h" +#include "pulp_nnx_util.h" + +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { + uint32_t tile_padding = padding; + if (i_height > 0) { + tile_padding &= ~(0xf << 28); + } + if (i_width < n_width - 1) { + tile_padding &= ~(0xf << 24); + } + if (i_height < n_height - 1) { + tile_padding &= ~(0xf << 20); + } + if (i_width > 0) { + tile_padding &= ~(0xf << 16); + } + return tile_padding; +} + +void neureka_task_init(neureka_task_t *task) { + *task = (neureka_task_t){.data = {0}}; +} + +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_OUTPUT_CHANNEL; + task->subtile_input_channel = kernel_shape == 3 + ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1; + + const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 + : depthwise == 1 ? 
NEUREKA_FLAG_MODE_3x3_DW + : NEUREKA_FLAG_MODE_3x3; + + task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE); + task->data.cfg.conf0 |= flag_mode; +} + +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits) { + neureka_quant_mode_e quantMode; + if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->qw = weight_bits; + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | (weight_bits - 1); +} + +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm) { + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT | + NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS | + NEUREKA_MASK_FLAG_NORM_SHIFT); + task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function | + (quant.shift_amount << 16) | norm.mode | + norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT; +} + +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; +} + +void neureka_task_set_input_signed(neureka_task_t *task) { + task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_input_unsigned(neureka_task_t *task) { + task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE; + task->data.cfg.conf0 |= weight_source; +} + +void neureka_task_set_activation_prefetch(neureka_task_t *task, + neureka_activation_prefetch_e activation_prefetch) { + task->data.cfg.conf0 
&= ~NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH; + task->data.cfg.conf0 |= activation_prefetch; +} + +/** neureka_pad_ptr + * + * Calculate the pointer to the start of the ptr as if + * it was the start to the padded data. + * Necessary for input pointer when it's padded. + */ +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left) { + return ptr - (padding_top * width + padding_left) * width_stride; +} + +void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t w_in_stride, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr) { + task->data.infeat_ptr = + neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left); + task->data.outfeat_ptr = output_ptr; + task->data.weights_ptr = weights_ptr; +} + +void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr, + uint32_t shift_ptr, uint32_t bias_ptr) { + task->data.scale_ptr = scale_ptr; + task->data.scale_shift_ptr = shift_ptr; + task->data.scale_bias_ptr = bias_ptr; +} + +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, + const uint32_t w_in_stride, + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = + nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel); + + const neureka_stride_t input_stride = { + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; + task->data.cfg.input_stride = input_stride; + + const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; + task->data.cfg.output_stride = output_stride; + + if (task->kernel_shape == 1) { // 1x1 + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1; + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 * num_k_in; + } else if (!task->depthwise) { // 3x3 + 
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3; + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 * task->qw * num_k_in; + } else { // 3x3 depthwise + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3; + task->data.cfg.weights_stride.d1 = 0; + } + task->data.cfg.weights_stride.d2 = 0; +} + +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right) { + const uint16_t num_Ko = + nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel); + const uint16_t num_Ki = + nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel); + const uint16_t num_Ho = + nnx_calculate_number_of_tiles(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = + nnx_calculate_number_of_tiles(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = + nnx_calculate_last_tile_size(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = + nnx_calculate_last_tile_size(k_in, task->subtile_input_channel); + const uint16_t rem_Ho = + nnx_calculate_last_tile_size(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = + nnx_calculate_last_tile_size(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + const uint16_t rem_Hi = + (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; + const uint16_t rem_Wi = + (task->kernel_shape == 1 ? 
rem_Wo : rem_Wo + 2) - padding_right; + + const neureka_subtile_t subtile = { + .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki), + .HoWo = nnx_concat_half(num_Ho, num_Wo)}, + .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki), + .HoWo = nnx_concat_half(rem_Ho, rem_Wo), + .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}}; + task->data.cfg.subtile = subtile; +} + +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { + task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | + ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | + (value & 0xff); +} + +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right) { + task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | + ((bottom & 0xff) << 8) | ((left & 0xff) << 0); +} + +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t h_out_stride, const uint32_t w_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_left, const uint8_t padding_right) { + neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); + neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, + padding_right); + neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, + padding_right, 0); +} diff --git a/neureka/app/hal/neureka_task.h b/neureka/app/hal/neureka_task.h new file mode 100644 index 0000000..4022fc0 --- /dev/null +++ b/neureka/app/hal/neureka_task.h @@ -0,0 +1,192 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_TASK_H__ +#define __NEUREKA_TASK_H__ + +#include "neureka_task_defs.h" +#include + +typedef enum neureka_task_flag_e { + neurekaTaskFlagFalse = 0, + neurekaTaskFlagTrue = 1 +} neureka_task_flag_e; + +typedef enum neureka_weight_source_e { + neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM, + neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM +} neureka_weight_source_e; + +typedef enum neureka_weight_offset_mode_e { + weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, + weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE +} neureka_weight_offset_mode_e; + +typedef enum neureka_activation_prefetch_e { + activationPrefetchOn = NEUREKA_FLAG_ACTIVATION_PREFETCH_ON, + activationPrefetchOff = NEUREKA_FLAG_ACTIVATION_PREFETCH_OFF +} neureka_activation_prefetch_e; + +typedef enum { + normMode8Bit = NEUREKA_NORM_MODE_8BIT, + normMode32Bit = NEUREKA_NORM_MODE_32BIT +} neureka_norm_mode_e; + +typedef struct neureka_norm_t { + neureka_norm_mode_e mode; + neureka_task_flag_e flag_bias; + neureka_task_flag_e flag_shift; +} neureka_norm_t; + +typedef enum neureka_quant_mode_e { + quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, + quantMode32Bit = NEUREKA_QUANT_MODE_32BIT +} neureka_quant_mode_e; + +typedef enum neureka_quant_function_e { + quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, + quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU +} neureka_quant_function_e; + +typedef struct 
neureka_quant_t { + // Shift amount must be in range 0x00-0x1F + uint8_t shift_amount; + neureka_quant_function_e function; + neureka_task_flag_e flag_rounding; +} neureka_quant_t; + +typedef struct neureka_stride_t { + uint32_t d0; + uint32_t d1; + uint32_t d2; +} neureka_stride_t; + +typedef struct neureka_subtile_remainder_t { + uint32_t KoKi; + uint32_t HoWo; + uint32_t HiWi; +} neureka_subtile_remainder_t; + +typedef struct neureka_subtile_number_t { + uint32_t KoKi; + uint32_t HoWo; +} neureka_subtile_number_t; + +typedef struct neureka_subtile_t { + neureka_subtile_remainder_t remainder; + neureka_subtile_number_t number; +} neureka_subtile_t; + +typedef struct neureka_cfg_t { + neureka_stride_t input_stride; + neureka_stride_t output_stride; + neureka_stride_t weights_stride; + neureka_subtile_t subtile; + uint32_t padding; + uint32_t weight_offset_factor; + uint32_t filter_mask; + uint32_t conf0; +} neureka_cfg_t; + +typedef struct neureka_task_data_t { + uint32_t weights_ptr; + uint32_t infeat_ptr; + uint32_t outfeat_ptr; + uint32_t scale_ptr; + uint32_t scale_shift_ptr; + uint32_t scale_bias_ptr; + neureka_cfg_t cfg; +} neureka_task_data_t; + +typedef struct neureka_task_t { + neureka_task_data_t data; + uint8_t qw; + uint8_t subtile_output_channel; + uint8_t subtile_input_channel; + uint8_t kernel_shape; + uint8_t depthwise; + uint8_t id; +} neureka_task_t; + +void neureka_task_init(neureka_task_t *task); +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise); +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits); +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm); +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); +void neureka_task_set_input_signed(neureka_task_t *task); 
+void neureka_task_set_input_unsigned(neureka_task_t *task); +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source); +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width); +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left); +void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t w_in_stride, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr); +void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr, + uint32_t shift_ptr, uint32_t bias_ptr); +/** neureka_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, + const uint32_t w_in_stride, + const uint32_t h_out_stride, + const uint32_t w_out_stride); +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right); +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value); +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right); +/** neureka_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. 
There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t h_out_stride, const uint32_t w_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_left, const uint8_t padding_right); + +#endif // !__NEUREKA_TASK_H__ diff --git a/neureka/app/hal/neureka_task_defs.h b/neureka/app/hal/neureka_task_defs.h new file mode 100644 index 0000000..7417347 --- /dev/null +++ b/neureka/app/hal/neureka_task_defs.h @@ -0,0 +1,131 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_DEFS_H__ +#define __NEUREKA_DEFS_H__ + +/* ARCHITECTURE */ + +#define NNX_NEUREKA_PE_H (4) +#define NNX_NEUREKA_PE_W (4) +#define NNX_NEUREKA_BANDWIDTH_1x1 (256) +#define NNX_NEUREKA_BANDWIDTH_3x3 (288) + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (NNX_NEUREKA_PE_H) +#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (NNX_NEUREKA_PE_W) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32) + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (NNX_NEUREKA_PE_H+2) +#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (NNX_NEUREKA_PE_W+2) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (32) + +#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (NNX_NEUREKA_PE_H) +#define NEUREKA_SUBTILE_OUTPUT_WIDTH (NNX_NEUREKA_PE_W) +#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32) + +#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 (NNX_NEUREKA_BANDWIDTH_1x1/8) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 (NNX_NEUREKA_BANDWIDTH_3x3/8) + +/* TASK REGISTERS */ + +// job configuration +#define NEUREKA_REG_WEIGHTS_PTR 0 +#define NEUREKA_REG_INFEAT_PTR 1 +#define NEUREKA_REG_OUTFEAT_PTR 2 +#define NEUREKA_REG_SCALE_PTR 3 +#define NEUREKA_REG_SCALE_SHIFT_PTR 4 +#define NEUREKA_REG_SCALE_BIAS_PTR 5 +#define NEUREKA_REG_INFEAT_D0_STRIDE 6 +#define NEUREKA_REG_INFEAT_D1_STRIDE 7 +#define NEUREKA_REG_INFEAT_D2_STRIDE 8 +#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9 +#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10 +#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11 +#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12 +#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13 +#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14 +#define NEUREKA_REG_SUBTILE_REMAINDER_0 15 +#define NEUREKA_REG_SUBTILE_REMAINDER_1 16 +#define NEUREKA_REG_SUBTILE_REMAINDER_2 17 +#define NEUREKA_REG_SUBTILE_NUMBER_0 18 +#define NEUREKA_REG_SUBTILE_NUMBER_1 19 +#define NEUREKA_REG_PADDING 20 +#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21 +#define NEUREKA_REG_FILTER_MASKING 22 +#define NEUREKA_REG_CONF0 23 + +/* SHIFT */ + +#define 
NEUREKA_SHIFT_FLAG_INPUT_SIGNED (26) +#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) +#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) +#define NEUREKA_SHIFT_QUANT_SHIFT (16) + +/* CONF0 FLAGS */ + +#define NEUREKA_FLAG_INPUT_SIGNED (1 << 26) +#define NEUREKA_FLAG_NORM_BIAS (1 << 25) +#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) +#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) +#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) +#define NEUREKA_QUANT_MODE_8BIT (0 << 21) +#define NEUREKA_QUANT_MODE_32BIT (2 << 21) +// conf0[20:16] - quantization shift amount +#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE \ + (1 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_STREAMIN (1 << 14) +#define NEUREKA_NORM_MODE_8BIT (0 << 12) +#define NEUREKA_NORM_MODE_32BIT (2 << 12) +#define NEUREKA_FLAG_ACTIVATION_PREFETCH_ON (1 << 10) +#define NEUREKA_FLAG_ACTIVATION_PREFETCH_OFF (0 << 10) +#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9) +#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9) +#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested +#define NEUREKA_FLAG_MODE_3x3 (0 << 5) +#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) +#define NEUREKA_FLAG_MODE_1x1 (2 << 5) +#define NEUREKA_FLAG_NORM_QUANT (1 << 4) + +/* Masks */ + +#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26) +#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23) +#define NEUREKA_MASK_QUANT_MODE (0x3 << 21) +#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NEUREKA_MASK_NORM_MODE (0x3 << 12) +#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10) +#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9) +#define NEUREKA_MASK_FLAG_MODE (0x3 << 5) +#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0) + +/* PADDING */ + +#define NEUREKA_DONT_PAD (0) +#define NEUREKA_MAX_PAD (2) + +/* NORM */ +#define 
NEUREKA_NORM_MAX_LEN (32) + +#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/app/inc/layer_util.h b/neureka/app/inc/layer_util.h new file mode 100644 index 0000000..e44ede9 --- /dev/null +++ b/neureka/app/inc/layer_util.h @@ -0,0 +1,40 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __LAYER_UTIL_H__ +#define __LAYER_UTIL_H__ + +#include "layer_conf.h" +#include + +static void layer_info() { + printf("Layer info:\n" + " - input: (%dx%dx%d)\n" + " - output: (%dx%dx%d)\n" + " - weight: (%dx%dx%dx%d)\n" + " - stride: (%dx%d)\n" + " - padding: (%dx%dx%dx%d)\n", + INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, + OUTPUT_CHANNEL, WEIGHT_CHANNEL_OUT, WEIGHT_HEIGHT, WEIGHT_WIDTH, + WEIGHT_CHANNEL_IN, STRIDE_HEIGHT, STRIDE_WIDTH, PADDING_TOP, + PADDING_BOTTOM, PADDING_LEFT, PADDING_RIGHT); +} + +#endif // __LAYER_UTIL_H__ diff --git a/neureka/app/inc/nnx_layer.h b/neureka/app/inc/nnx_layer.h new file mode 100644 index 0000000..cbaf4b9 --- /dev/null +++ b/neureka/app/inc/nnx_layer.h @@ -0,0 +1,26 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NNX_LAYER_H__ +#define __NNX_LAYER_H__ + +void execute_nnx_layer(void *unused_args); + +#endif // __NNX_LAYER_H__ diff --git a/neureka/app/inc/pmsis.h b/neureka/app/inc/pmsis.h new file mode 100644 index 0000000..1f16f1b --- /dev/null +++ b/neureka/app/inc/pmsis.h @@ -0,0 +1,5 @@ +// fake pmsis.h +#include +#define PI_L1 __attribute__((section(".data_l1"))) +#define PI_L2 __attribute__((section(".data_l1"))) +// #include "tinyprintf.h" \ No newline at end of file diff --git a/neureka/app/inc/tinyprintf.h b/neureka/app/inc/tinyprintf.h new file mode 100644 index 0000000..0af0609 --- /dev/null +++ b/neureka/app/inc/tinyprintf.h @@ -0,0 +1,248 @@ +/* + +Copyright (c) 2004,2012 Kustaa Nyholm / SpareTimeLabs + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Kustaa Nyholm or SpareTimeLabs nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This modified tinyprintf is distributed under the conditions of the BSD-new license +as explicitly allowed: + -> https://github.com/cjlano/tinyprintf commit 2ee3012 + +They provide a simple and small (+400 loc) printf functionality to +be used in embedded systems. + +I've found them so useful in debugging that I do not bother with a +debugger at all. + +They are distributed in source form, so to use them, just compile them +into your project. + +Two printf variants are provided: printf and the 'sprintf' family of +functions ('snprintf', 'sprintf', 'vsnprintf', 'vsprintf'). + +The formats supported by this implementation are: +'c' 'd' 'i' 'o' 'p' 'u' 's' 'x' 'X'. + +Zero padding and field width are also supported. + +If the library is compiled with 'PRINTF_SUPPORT_LONG' defined, then +the long specifier is also supported. Note that this will pull in some +long math routines (pun intended!) and thus make your executable +noticeably longer. Likewise with 'PRINTF_LONG_LONG_SUPPORT' for the +long long specifier, and with 'PRINTF_SIZE_T_SUPPORT' for the size_t +specifier. 
+ +The memory footprint of course depends on the target CPU, compiler and +compiler options, but a rough guesstimate (based on a H8S target) is about +1.4 kB for code and some twenty 'int's and 'char's, say 60 bytes of stack space. +Not too bad. Your mileage may vary. By hacking the source code you can +get rid of some hundred bytes, I'm sure, but personally I feel the balance of +functionality and flexibility versus code size is close to optimal for +many embedded systems. + +To use the printf, you need to supply your own character output function, +something like : + +void putc ( void* p, char c) +{ + while (!SERIAL_PORT_EMPTY) ; + SERIAL_PORT_TX_REGISTER = c; +} + +Before you can call printf, you need to initialize it to use your +character output function with something like: + +init_printf(NULL,putc); + +Notice the 'NULL' in 'init_printf' and the parameter 'void* p' in 'putc', +the NULL (or any pointer) you pass into the 'init_printf' will eventually be +passed to your 'putc' routine. This allows you to pass some storage space (or +anything really) to the character output function, if necessary. +This is not often needed but it was implemented like that because it made +implementing the sprintf function so neat (look at the source code). + +The code is re-entrant, except for the 'init_printf' function, so it is safe +to call it from interrupts too, although this may result in mixed output. +If you rely on re-entrancy, take care that your 'putc' function is re-entrant! + +The printf and sprintf functions are actually macros that translate to +'tfp_printf' and 'tfp_sprintf' when 'TINYPRINTF_OVERRIDE_LIBC' is set +(default). Setting it to 0 makes it possible to use them along with +'stdio.h' printf's in a single source file. When +'TINYPRINTF_OVERRIDE_LIBC' is set, please note that printf/sprintf are +not function-like macros, so if you have variables or struct members +with these names, things will explode in your face. 
Without variadic +macros this is the best we can do to wrap these function. If it is a +problem, just give up the macros and use the functions directly, or +rename them. + +It is also possible to avoid defining tfp_printf and/or tfp_sprintf by +clearing 'TINYPRINTF_DEFINE_TFP_PRINTF' and/or +'TINYPRINTF_DEFINE_TFP_SPRINTF' to 0. This allows for example to +export only tfp_format, which is at the core of all the other +functions. + +For further details see source code. + +regs Kusti, 23.10.2004 +*/ + +#ifndef __TFP_PRINTF__ +#define __TFP_PRINTF__ + +#include + +void putf(char *null, char c); + +/* Global configuration */ + +/* Set this to 0 if you do not want to provide tfp_printf */ +#ifndef TINYPRINTF_DEFINE_TFP_PRINTF +# define TINYPRINTF_DEFINE_TFP_PRINTF 1 +#endif + +/* Set this to 0 if you do not want to provide + tfp_sprintf/snprintf/vsprintf/vsnprintf */ +#ifndef TINYPRINTF_DEFINE_TFP_SPRINTF +# define TINYPRINTF_DEFINE_TFP_SPRINTF 1 +#endif + +/* Set this to 0 if you do not want tfp_printf and + tfp_{vsn,sn,vs,s}printf to be also available as + printf/{vsn,sn,vs,s}printf */ +#ifndef TINYPRINTF_OVERRIDE_LIBC +# define TINYPRINTF_OVERRIDE_LIBC 1 +#endif + +/* Optional external types dependencies */ + +#if TINYPRINTF_DEFINE_TFP_SPRINTF +# include /* size_t */ +#endif + +/* Declarations */ + +#ifdef __GNUC__ +# define _TFP_SPECIFY_PRINTF_FMT(fmt_idx,arg1_idx) \ + __attribute__((format (printf, fmt_idx, arg1_idx))) +#else +# define _TFP_SPECIFY_PRINTF_FMT(fmt_idx,arg1_idx) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*putcf) (void *, char); + +/* + 'tfp_format' really is the central function for all tinyprintf. For + each output character after formatting, the 'putf' callback is + called with 2 args: + - an arbitrary void* 'putp' param defined by the user and + passed unmodified from 'tfp_format', + - the character. 
+ The 'tfp_printf' and 'tfp_sprintf' functions simply define their own + callback and pass to it the right 'putp' it is expecting. +*/ +void tfp_format(void *putp, putcf putf, const char *fmt, va_list va); + +#if TINYPRINTF_DEFINE_TFP_SPRINTF +int tfp_vsnprintf(char *str, size_t size, const char *fmt, va_list ap); +int tfp_snprintf(char *str, size_t size, const char *fmt, ...) \ + _TFP_SPECIFY_PRINTF_FMT(3, 4); +int tfp_vsprintf(char *str, const char *fmt, va_list ap); +int tfp_sprintf(char *str, const char *fmt, ...) \ + _TFP_SPECIFY_PRINTF_FMT(2, 3); +# if TINYPRINTF_OVERRIDE_LIBC +# define vsnprintf tfp_vsnprintf +# define snprintf tfp_snprintf +# define vsprintf tfp_vsprintf +# define sprintf tfp_sprintf +# endif +#endif + +#if TINYPRINTF_DEFINE_TFP_PRINTF +void init_printf(void *putp, putcf putf); +void tfp_printf(char *fmt, ...) _TFP_SPECIFY_PRINTF_FMT(1, 2); +# if TINYPRINTF_OVERRIDE_LIBC +# ifndef DISABLE_PRINTF +# define printf tfp_printf +# else +# define printf(...) +# endif +# endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif + + + +/* + * Configuration + */ + +/* Enable long int support */ +// #define PRINTF_LONG_SUPPORT + +/* Enable long long int support (implies long int support) */ +// #define PRINTF_LONG_LONG_SUPPORT + +/* Enable %z (size_t) support */ +// #define PRINTF_SIZE_T_SUPPORT + +/* + * Configuration adjustments + */ +#ifdef PRINTF_SIZE_T_SUPPORT +#include +#endif + +#ifdef PRINTF_LONG_LONG_SUPPORT +# define PRINTF_LONG_SUPPORT +#endif + +/* __SIZEOF___ defined at least by gcc */ +#ifdef __SIZEOF_POINTER__ +# define SIZEOF_POINTER __SIZEOF_POINTER__ +#endif +#ifdef __SIZEOF_LONG_LONG__ +# define SIZEOF_LONG_LONG __SIZEOF_LONG_LONG__ +#endif +#ifdef __SIZEOF_LONG__ +# define SIZEOF_LONG __SIZEOF_LONG__ +#endif +#ifdef __SIZEOF_INT__ +# define SIZEOF_INT __SIZEOF_INT__ +#endif + +#ifdef __GNUC__ +# define _TFP_GCC_NO_INLINE_ __attribute__ ((noinline)) +#else +# define _TFP_GCC_NO_INLINE_ +#endif + diff --git 
a/neureka/app/pulp_nnx_neureka.c b/neureka/app/pulp_nnx_neureka.c new file mode 100644 index 0000000..b814f23 --- /dev/null +++ b/neureka/app/pulp_nnx_neureka.c @@ -0,0 +1,76 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "pulp_nnx_neureka.h" +#include "hwpe.h" +#include "neureka.h" +#include "pulp_nnx_util.h" +#include +#include +#include + +void neureka_nnx_init(neureka_dev_t *dev, neureka_bsp_conf_t *conf) { + neureka_bsp_open(conf); + hwpe_soft_clear(&dev->hwpe_dev); +} + +void neureka_nnx_term(neureka_dev_t *dev) { + hwpe_soft_clear(&dev->hwpe_dev); + neureka_bsp_close(); +} + +int neureka_nnx_dispatch_check(neureka_dev_t *dev) { + return !neureka_task_queue_full(dev); +} + +void neureka_nnx_dispatch_wait(neureka_dev_t *dev) { + while (!neureka_nnx_dispatch_check(dev)) { + neureka_bsp_event_wait_and_clear(); + } +} + +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) { + if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) { + return 1; + } + hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data, + (int)(sizeof(neureka_task_data_t) / 4)); + hwpe_task_queue_release_and_run(&dev->hwpe_dev); + return 0; +} + +int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + // GVSOC model has a broken running_id so 
resolve_check + // conservatively checks whether the task queue is empty. + return neureka_task_queue_empty(dev); +#else + // Unresolved while hwpe_last_task_id() still returns the previous id, or + // returns this task's id with a non-empty task queue. + uint8_t prev_task_id = task->id - 1; + return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || + (hwpe_last_task_id(&dev->hwpe_dev) == task->id && + !neureka_task_queue_empty(dev))); +#endif +} + +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) { + while (!neureka_nnx_resolve_check(dev, task)) { + neureka_bsp_event_wait_and_clear(); + } +} diff --git a/neureka/app/pulp_nnx_neureka.h b/neureka/app/pulp_nnx_neureka.h new file mode 100644 index 0000000..80096c8 --- /dev/null +++ b/neureka/app/pulp_nnx_neureka.h @@ -0,0 +1,61 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka.h" +#include "neureka_bsp.h" +#include "neureka_task.h" +#include + +/* PULP-NNX interface */ + +void neureka_nnx_init(neureka_dev_t *dev, neureka_bsp_conf_t *conf); +void neureka_nnx_term(neureka_dev_t *dev); + +/** neureka_nnx_dispatch_check + * + * Check whether you can dispatch to the accelerator. + */ +int neureka_nnx_dispatch_check(neureka_dev_t *dev); + +/** neureka_nnx_dispatch_wait + * + * Block until you can dispatch to the accelerator. + */ +void neureka_nnx_dispatch_wait(neureka_dev_t *dev); + +/** neureka_nnx_dispatch + * + * Dispatch a task to the accelerator. 
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns + * 0. + */ +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task); + +/** neureka_nnx_resolve_check + * + * Check whether the task has been resolved. + */ +int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task); + +/** neureka_nnx_resolve_wait + * + * Block until you can resolve the task. + */ +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task); diff --git a/neureka/app/src/main.c b/neureka/app/src/main.c new file mode 100644 index 0000000..818ddf7 --- /dev/null +++ b/neureka/app/src/main.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2020-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Authors: Francesco Conti + * Gianna Paulin + * Renzo Andri + * Arpan Suravi Prasad + * Luka Macan + * Main Test Program for N-EUREKA + */ + +#include +#include +#include +#include + +#include "layer_util.h" +#include "nnx_layer.h" +#include "output.h" +#include "input.h" +#include "weight.h" + +int main() { + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + int err = 0; + if(rt_core_id()==0) + { + // execute NNX layer + execute_nnx_layer(NULL); + + printf("Checking outputs"); + + // output checking + err = check_output(); + + } + synch_barrier(); + return err; + +} diff --git a/neureka/app/src/nnx_layer.c b/neureka/app/src/nnx_layer.c new file mode 100644 index 0000000..f38e830 --- /dev/null +++ b/neureka/app/src/nnx_layer.c @@ -0,0 +1,168 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "nnx_layer.h" +#include + +#include "neureka.h" +#include "neureka_gvsoc.h" +#include "neureka_testbench_bsp.h" +#include "neureka_task.h" +#include "pulp_nnx_neureka.h" + +#define NULL 0 + +typedef neureka_norm_mode_e nnx_norm_mode_e; +typedef neureka_quant_t nnx_quant_t; +typedef neureka_quant_function_e nnx_quant_function_e; +typedef neureka_norm_t nnx_norm_t; +typedef neureka_task_t nnx_task_t; +typedef neureka_dev_t nnx_dev_t; +typedef neureka_testbench_conf_t nnx_bsp_conf_t; +typedef neureka_task_flag_e nnx_task_flag_e; + +#define nnxTaskFlagTrue neurekaTaskFlagTrue +#define nnxTaskFlagFalse neurekaTaskFlagFalse + +#define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset +#define nnx_task_set_weight_source neureka_task_set_weight_source +#define nnx_task_set_activation_prefetch neureka_task_set_activation_prefetch +#define nnx_task_set_dims neureka_task_set_dims +#define nnx_task_set_ptrs_conv neureka_task_set_ptrs_conv +#define nnx_task_set_ptrs_norm_quant neureka_task_set_ptrs_norm_quant + +#define nnx_bsp_get_dev neureka_testbench_get_dev + +#define nnx_init neureka_nnx_init +#define nnx_dispatch_wait neureka_nnx_dispatch_wait +#define nnx_dispatch neureka_nnx_dispatch +#define nnx_resolve_wait neureka_nnx_resolve_wait +#define nnx_term neureka_nnx_term + +// Generated headers +#include "bias.h" +#include "input.h" +#include "layer_conf.h" +#include "output.h" +#include "scale.h" +#include "weight.h" + +static void task_prepare(nnx_task_t *task) { + nnx_task_init(task); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1); + nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + + nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + 
+#ifdef NEUREKA_WEIGHT_SOURCE_WMEM + nnx_task_set_weight_source(task, neurekaWeightSourceWmem); + nnx_task_set_activation_prefetch(task, activationPrefetchOn); +#else + neureka_task_set_weight_source(task, neurekaWeightSourceTcdm); + nnx_task_set_activation_prefetch(task, activationPrefetchOff); +#endif +#if INPUT_SIGNED == 1 + neureka_task_set_input_signed(task); +#else + neureka_task_set_input_unsigned(task); +#endif + + const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride; + const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride; + +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride, + WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_LEFT, + PADDING_RIGHT); +#else + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, + w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_LEFT, + PADDING_RIGHT); +#endif + + nnx_task_set_ptrs_conv(task, (uint32_t)input, INPUT_WIDTH, w_in_stride, + PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight); +#if HAS_NORM_QUANT == 1 +#if SCALE_BITS == 8 + const nnx_norm_mode_e normMode = normMode8Bit; +#elif SCALE_BITS == 32 + const nnx_norm_mode_e normMode = normMode32Bit; +#endif + + const nnx_task_flag_e flag_bias = + HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse; + const uint32_t bias_ptr = (uint32_t)(HAS_BIAS ? bias : NULL); + + nnx_quant_function_e quant_function = + HAS_RELU ? 
quantFunctionRelu : quantFunctionIdentity; + + nnx_task_set_norm_quant(task, + (nnx_quant_t){.shift_amount = OUTSHIFT, + .function = quant_function, + .flag_rounding = nnxTaskFlagFalse}, + (nnx_norm_t){.mode = normMode, + .flag_bias = flag_bias, + .flag_shift = nnxTaskFlagFalse}); + + nnx_task_set_ptrs_norm_quant(task, (uint32_t)scale, NULL, bias_ptr); +#endif // HAS_NORM_QUANT +} + +static void task_execute(nnx_task_t *task) { + nnx_dev_t *dev = nnx_bsp_get_dev(); + + nnx_bsp_conf_t conf = {.max_stall = 8}; + nnx_init(dev, &conf); + + nnx_dispatch_wait(dev); + + // printf("CFG:\n"); + // for (int i=0; idata)[i]); + // } +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, + OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, + WEIGHT_WIDTH); +#else + nnx_dispatch(dev, task); +#endif + + nnx_resolve_wait(dev, task); + + nnx_term(dev); + +} + +void execute_nnx_layer(void *args) { + nnx_task_t task; + task_prepare(&task); + task_execute(&task); +} diff --git a/neureka/app/util/hwpe.c b/neureka/app/util/hwpe.c new file mode 100644 index 0000000..0430081 --- /dev/null +++ b/neureka/app/util/hwpe.c @@ -0,0 +1,85 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "hwpe.h" +#include + +#define HWPE_TRIGGER 0 +#define HWPE_ACQUIRE 1 +#define HWPE_FINISHED 2 +#define HWPE_STATUS 3 +#define HWPE_RUNNING_JOB 4 +#define HWPE_SOFT_CLEAR 5 +#define HWPE_SWSYNC 6 +#define HWPE_TASK_REG_OFFSET 8 + +inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { + dev->base_addr[reg] = value; +} + +inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) { + return dev->base_addr[reg]; +} + +inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { + hwpe_reg_write(dev, HWPE_TASK_REG_OFFSET + reg, value); +} + +inline uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg) { + return hwpe_reg_read(dev, HWPE_TASK_REG_OFFSET + reg); +} + +void hwpe_soft_clear(hwpe_dev_t *dev) { + hwpe_reg_write(dev, HWPE_SOFT_CLEAR, 0); + for (volatile int i = 0; i < 10; i++) + ; +} + +uint32_t hwpe_task_queue_status(hwpe_dev_t *dev) { + return hwpe_reg_read(dev, HWPE_STATUS); +} + +int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id) { + uint32_t read_value = (int32_t)hwpe_reg_read(dev, HWPE_ACQUIRE); + if (read_value >= 256) { + return 1; + } else { + *id = (uint8_t)read_value; + return 0; + } +} + +void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len) { + for (int i = 0; i < len; i++) { + hwpe_task_reg_write(dev, i, data[i]); + } +} + +void hwpe_task_queue_release_and_run(hwpe_dev_t *dev) { + hwpe_reg_write(dev, HWPE_TRIGGER, 0); +} + +void hwpe_task_queue_release(hwpe_dev_t *dev) { + hwpe_reg_write(dev, HWPE_TRIGGER, 1); +} + +uint8_t hwpe_last_task_id(hwpe_dev_t *dev) { + return (uint8_t)hwpe_reg_read(dev, HWPE_RUNNING_JOB); +} diff --git a/neureka/app/util/hwpe.h b/neureka/app/util/hwpe.h new file mode 100644 index 0000000..52bf912 --- /dev/null +++ b/neureka/app/util/hwpe.h @@ -0,0 +1,43 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __HWPE_H__ +#define __HWPE_H__ + +#include + +/* HWPE device */ +typedef struct hwpe_dev_t { + volatile uint32_t *base_addr; +} hwpe_dev_t; + +void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value); +uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg); +void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value); +uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg); +void hwpe_soft_clear(hwpe_dev_t *dev); +uint32_t hwpe_task_queue_status(hwpe_dev_t *dev); +int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id); +void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len); +void hwpe_task_queue_release_and_run(hwpe_dev_t *dev); +void hwpe_task_queue_release(hwpe_dev_t *dev); +uint8_t hwpe_last_task_id(hwpe_dev_t *dev); + +#endif // !__HWPE_H__ diff --git a/neureka/app/util/pulp_nnx_util.c b/neureka/app/util/pulp_nnx_util.c new file mode 100644 index 0000000..0107fc1 --- /dev/null +++ b/neureka/app/util/pulp_nnx_util.c @@ -0,0 +1,35 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "pulp_nnx_util.h" + +inline int nnx_calculate_number_of_tiles(const int dim_size, + const int tile_size) { + return ((dim_size - 1) / tile_size) + 1; +} + +inline int nnx_calculate_last_tile_size(const int dim_size, + const int tile_size) { + return ((dim_size - 1) % tile_size) + 1; +} + +inline uint32_t nnx_concat_half(const uint16_t high, const uint16_t low) { + return ((uint32_t)high << 16) | low; +} diff --git a/neureka/app/util/pulp_nnx_util.h b/neureka/app/util/pulp_nnx_util.h new file mode 100644 index 0000000..d167f6d --- /dev/null +++ b/neureka/app/util/pulp_nnx_util.h @@ -0,0 +1,51 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NNX_UTIL_H__ +#define __NNX_UTIL_H__ + +#include + +/** + * nnx_calculate_number_of_tiles + * + * Calculates the number of tiles needed to go through a dimension. 
+ * It does so by dividing the dimension by the tile size and taking the ceiling + * of the result. + */ +int nnx_calculate_number_of_tiles(const int dim_size, const int tile_size); + +/** + * nnx_calculate_last_tile_size + * + * Calculates the size of the last executed tile by calculating the remainder of + * the dim_size and the tile_size. In case the remainder is 0, it returns the + * full tile_size. + */ +int nnx_calculate_last_tile_size(const int dim_size, const int tile_size); + +/** + * nnx_concat_half + * + * Concatenate 2 16-bit numbers into a 32-bit number. + */ +uint32_t nnx_concat_half(const uint16_t high, const uint16_t low); + +#endif // __NNX_UTIL_H__ diff --git a/neureka/dense/Makefile b/neureka/dense/Makefile new file mode 100644 index 0000000..edb77cb --- /dev/null +++ b/neureka/dense/Makefile @@ -0,0 +1,3 @@ +include ../app/Makefile + +STIM_DIR := ../dense/ \ No newline at end of file diff --git a/neureka/dense/inc/bias.h b/neureka/dense/inc/bias.h new file mode 100644 index 0000000..0cc75d8 --- /dev/null +++ b/neureka/dense/inc/bias.h @@ -0,0 +1,9 @@ +#ifndef __BIAS_H__ +#define __BIAS_H__ + +#include + +#define BIAS_SIZE (35) +extern int32_t bias[BIAS_SIZE]; + +#endif // __BIAS_H__ diff --git a/neureka/dense/inc/input.h b/neureka/dense/inc/input.h new file mode 100644 index 0000000..ddb3845 --- /dev/null +++ b/neureka/dense/inc/input.h @@ -0,0 +1,9 @@ +#ifndef __INPUT_H__ +#define __INPUT_H__ + +#include + +#define INPUT_SIZE (1407) +extern uint8_t input[INPUT_SIZE]; + +#endif // __INPUT_H__ diff --git a/neureka/dense/inc/layer_conf.h b/neureka/dense/inc/layer_conf.h new file mode 100644 index 0000000..d005132 --- /dev/null +++ b/neureka/dense/inc/layer_conf.h @@ -0,0 +1,42 @@ +#ifndef __LAYER_CONF_H__ +#define __LAYER_CONF_H__ + +#define TEST_NAME "test" +#define INPUT_HEIGHT (7) +#define INPUT_WIDTH (3) +#define INPUT_CHANNEL (67) +#define INPUT_SIGNED (0) +#define INPUT_BITS (8) + +#define OUTPUT_HEIGHT (5) +#define OUTPUT_WIDTH (1) +#define 
OUTPUT_CHANNEL (35) +#define OUTPUT_BITS (8) + +#define WEIGHT_HEIGHT (3) +#define WEIGHT_WIDTH (3) +#define WEIGHT_CHANNEL_IN (67) +#define WEIGHT_CHANNEL_OUT (35) +#define WEIGHT_BITS (8) +#define WEIGHT_OFFSET (-128) + +#define SCALE_BITS (8) + +#define BIAS_BITS (32) + +#define PADDING_TOP (0) +#define PADDING_BOTTOM (0) +#define PADDING_LEFT (0) +#define PADDING_RIGHT (0) +#define PADDING_VALUE (0) + +#define STRIDE_HEIGHT (1) +#define STRIDE_WIDTH (1) + +#define GROUPS (1) +#define OUTSHIFT (12) +#define HAS_NORM_QUANT (1) +#define HAS_BIAS (1) +#define HAS_RELU (1) + +#endif // __LAYER_CONF_H__ diff --git a/neureka/dense/inc/output.h b/neureka/dense/inc/output.h new file mode 100644 index 0000000..cb7abf6 --- /dev/null +++ b/neureka/dense/inc/output.h @@ -0,0 +1,14 @@ +#ifndef __OUTPUT_H__ +#define __OUTPUT_H__ + +#include + +#define OUTPUT_SIZE (175) +extern uint8_t output[OUTPUT_SIZE]; + +#define GOLDEN_OUTPUT_SIZE (175) +extern uint8_t golden_output[GOLDEN_OUTPUT_SIZE]; + +int check_output(); + +#endif // __OUTPUT_H__ diff --git a/neureka/dense/inc/scale.h b/neureka/dense/inc/scale.h new file mode 100644 index 0000000..3c546a5 --- /dev/null +++ b/neureka/dense/inc/scale.h @@ -0,0 +1,9 @@ +#ifndef __SCALE_H__ +#define __SCALE_H__ + +#include + +#define SCALE_SIZE (35) +extern uint8_t scale[SCALE_SIZE]; + +#endif // __SCALE_H__ diff --git a/neureka/dense/inc/weight.h b/neureka/dense/inc/weight.h new file mode 100644 index 0000000..635a7b4 --- /dev/null +++ b/neureka/dense/inc/weight.h @@ -0,0 +1,9 @@ +#ifndef __WEIGHT_H__ +#define __WEIGHT_H__ + +#include + +#define WEIGHT_SIZE (30240) +extern uint8_t weight[WEIGHT_SIZE]; + +#endif // __WEIGHT_H__ diff --git a/neureka/dense/src/bias.c b/neureka/dense/src/bias.c new file mode 100644 index 0000000..f6deb22 --- /dev/null +++ b/neureka/dense/src/bias.c @@ -0,0 +1,10 @@ +#include "bias.h" + +#define BIAS_SIZE (35) +PI_L1 int32_t bias[BIAS_SIZE] = { + 0x34f1, 0x26e29, -0x13522, -0x16519, -0x29e74, -0x26137, 
-0xa2d1, 0x224b4, -0x2b112, -0x5691, + 0x288e, -0x2eec7, 0x31b00, -0x13b14, 0x19292, 0x10f19, 0x2e986, 0x3492f, 0x6d7c, 0x19cbd, + 0x11dd9, -0xecc1, -0x9d88, 0xa5fb, -0x28ff8, -0x24d17, -0x1c933, 0x32120, -0x2ad82, -0x2f262, + -0x14bbf, -0x155b3, 0x18b39, 0x1e105, 0x3041c +}; + diff --git a/neureka/dense/src/input.c b/neureka/dense/src/input.c new file mode 100644 index 0000000..7d83843 --- /dev/null +++ b/neureka/dense/src/input.c @@ -0,0 +1,147 @@ +#include "input.h" + +#define INPUT_SIZE (1407) +PI_L1 uint8_t input[INPUT_SIZE] = { + 0xc2, 0x2c, 0xe2, 0x81, 0x81, 0x2d, 0xa9, 0xb3, 0x4c, 0x8a, + 0x32, 0x75, 0x40, 0xc1, 0xed, 0x7a, 0x9a, 0x7d, 0x5e, 0xdf, + 0xc8, 0x44, 0x83, 0xe8, 0x1e, 0x1b, 0xd2, 0x84, 0x39, 0xc5, + 0x2c, 0x27, 0x9b, 0x78, 0xb3, 0x70, 0x5a, 0x5b, 0xb5, 0x95, + 0x3c, 0x8f, 0xe4, 0x32, 0xfa, 0x12, 0xb7, 0xc7, 0x48, 0x91, + 0x02, 0x33, 0xd0, 0xbe, 0x57, 0xae, 0x61, 0xd0, 0x8a, 0x5d, + 0xd1, 0x72, 0x25, 0x0a, 0x81, 0x35, 0xd7, 0x36, 0xec, 0x04, + 0x37, 0xda, 0x99, 0xf4, 0x28, 0x46, 0xc8, 0xdd, 0x25, 0xae, + 0x37, 0x78, 0xf2, 0x15, 0x48, 0x57, 0xc2, 0x5b, 0xe1, 0x63, + 0x07, 0xab, 0xac, 0x7c, 0x2a, 0xe0, 0xbc, 0x75, 0xa8, 0xf0, + 0x48, 0x74, 0xe1, 0xad, 0xce, 0x17, 0x21, 0xf1, 0x72, 0x3d, + 0x08, 0x00, 0x44, 0x04, 0x46, 0xc7, 0x35, 0x39, 0x5f, 0xf9, + 0xdc, 0xad, 0x25, 0x23, 0xd5, 0x0f, 0xf0, 0x5c, 0x23, 0x08, + 0x25, 0x82, 0xd9, 0x76, 0xe9, 0xd1, 0x10, 0x9b, 0x13, 0x38, + 0x67, 0x8b, 0x96, 0x80, 0x7f, 0xe8, 0x58, 0xbf, 0x9e, 0xf3, + 0x34, 0x20, 0x06, 0xb6, 0xcd, 0x80, 0x3f, 0xf5, 0x2b, 0x95, + 0x90, 0x66, 0xa7, 0xc3, 0xf0, 0x01, 0x13, 0x11, 0x09, 0xbd, + 0x94, 0xfe, 0x52, 0x98, 0xda, 0x19, 0x19, 0x1a, 0x47, 0xca, + 0x5f, 0x7b, 0x64, 0xd8, 0x15, 0xb1, 0xcd, 0x3a, 0xe0, 0x29, + 0x61, 0x93, 0xa6, 0xb9, 0x99, 0x3b, 0x66, 0x46, 0x37, 0x4f, + 0x64, 0x87, 0xb3, 0xe3, 0x1c, 0xdf, 0x20, 0xc4, 0x48, 0xca, + 0x9c, 0xa8, 0x48, 0xd0, 0x52, 0x06, 0x3d, 0x87, 0x29, 0x23, + 0x99, 0x68, 0xed, 0x0a, 0x5c, 0xee, 0x5a, 0xec, 0xd6, 0x6e, + 0x65, 0x75, 0xa7, 0x0f, 0x17, 
0x4b, 0xa0, 0xcf, 0xc4, 0xdc, + 0x28, 0xc2, 0x69, 0x4a, 0x61, 0x24, 0x55, 0xa2, 0x0c, 0xcd, + 0x6c, 0x79, 0xf8, 0xa1, 0xc9, 0x39, 0xc0, 0xe2, 0xc8, 0x00, + 0xe5, 0xbe, 0xf0, 0x81, 0x2c, 0xc4, 0xa9, 0x0c, 0xdf, 0xce, + 0xee, 0xe8, 0xb8, 0x62, 0x49, 0x07, 0xaa, 0x3c, 0xed, 0x30, + 0xd9, 0x77, 0x10, 0xd7, 0x32, 0x5e, 0x3d, 0xef, 0x27, 0x13, + 0x50, 0xb5, 0xb5, 0x21, 0x76, 0x14, 0x8f, 0x77, 0x7d, 0xb1, + 0xf8, 0xc3, 0x45, 0xfd, 0x35, 0x59, 0xa4, 0xca, 0xca, 0xbd, + 0x8d, 0xea, 0x83, 0x78, 0xa2, 0x6b, 0x11, 0xef, 0xae, 0x9d, + 0x1f, 0x9d, 0xcc, 0x98, 0x89, 0x22, 0x8b, 0x7d, 0xe8, 0xa7, + 0x61, 0x9e, 0x0c, 0xe3, 0x7e, 0xcc, 0x6d, 0x19, 0xbc, 0x71, + 0x8c, 0x01, 0x2a, 0x03, 0xa7, 0x81, 0x99, 0x12, 0x93, 0xc7, + 0x18, 0x3d, 0x66, 0x37, 0x98, 0x08, 0xf2, 0xb6, 0x0b, 0xa2, + 0x89, 0x65, 0x34, 0x07, 0x6e, 0x09, 0x84, 0xfe, 0x73, 0xf8, + 0x96, 0xbd, 0x09, 0x5c, 0x47, 0x1e, 0x0e, 0xa9, 0x58, 0xe7, + 0x5d, 0xc1, 0xdb, 0xe8, 0x67, 0x40, 0x21, 0x2b, 0x6a, 0x00, + 0x49, 0x57, 0xd1, 0x67, 0x18, 0xfa, 0x79, 0x87, 0xd1, 0x45, + 0x5a, 0xbb, 0x43, 0x3b, 0x2f, 0xd9, 0xbe, 0x8b, 0x61, 0x1f, + 0xc9, 0xa9, 0xe9, 0x10, 0xcb, 0x5b, 0x24, 0x82, 0x30, 0x5a, + 0x77, 0xe4, 0x2f, 0x40, 0x67, 0x55, 0xd1, 0x84, 0x29, 0x91, + 0x6f, 0x4b, 0x21, 0x94, 0xaa, 0x1f, 0x50, 0x1c, 0xc0, 0xb9, + 0x19, 0x0a, 0xd2, 0xe3, 0x7f, 0x91, 0x4d, 0x26, 0x93, 0x3b, + 0x01, 0xfd, 0x69, 0xba, 0x5d, 0xd0, 0x2b, 0x53, 0x6e, 0xd6, + 0x0f, 0x95, 0xde, 0x25, 0xbb, 0x3a, 0x6b, 0x36, 0x50, 0xa3, + 0xf9, 0x3d, 0x4d, 0xb2, 0x59, 0x49, 0xd1, 0xf3, 0x5c, 0x4e, + 0xd4, 0xb9, 0x6d, 0x1a, 0x72, 0x5e, 0x42, 0x92, 0x5b, 0xc8, + 0xdc, 0x89, 0x28, 0xe2, 0xae, 0xee, 0x61, 0xe5, 0x79, 0x3e, + 0x10, 0x7c, 0x62, 0x2b, 0x9b, 0xf6, 0x0c, 0x6e, 0x61, 0xc1, + 0xf1, 0x34, 0x77, 0x52, 0x0f, 0xa2, 0xce, 0x21, 0x27, 0x62, + 0xc6, 0xb3, 0xc1, 0x74, 0x43, 0xeb, 0xcb, 0x74, 0x82, 0x91, + 0xf9, 0x38, 0x81, 0x76, 0x91, 0x50, 0x8e, 0x96, 0x73, 0x14, + 0x61, 0x62, 0x95, 0x6d, 0x01, 0x16, 0x24, 0xb2, 0x66, 0x30, + 0x0e, 0x33, 0x47, 0x37, 0xa4, 0xc8, 0x0f, 
0xc7, 0x1d, 0xed, + 0xc4, 0x01, 0x54, 0xa2, 0xbb, 0xfe, 0x15, 0x0c, 0x0d, 0x64, + 0xd5, 0x09, 0x59, 0xf4, 0x3d, 0x46, 0xfb, 0x0c, 0xc6, 0x6d, + 0x46, 0x2e, 0x84, 0x28, 0x4a, 0xed, 0x33, 0xad, 0xfd, 0x9b, + 0xfd, 0x99, 0xd8, 0x17, 0x89, 0xe0, 0x42, 0x71, 0x39, 0xba, + 0xa1, 0xfe, 0x23, 0xf3, 0x88, 0x95, 0xc3, 0x3a, 0xb8, 0x5b, + 0x0b, 0x5c, 0x0c, 0x23, 0x64, 0x4b, 0xe9, 0x0d, 0x0f, 0x33, + 0xd2, 0x20, 0x0d, 0x1f, 0x7a, 0x39, 0x2d, 0x8f, 0xc5, 0xd1, + 0x48, 0x85, 0xba, 0x06, 0x4c, 0xbe, 0xd4, 0xfb, 0x8d, 0x6a, + 0xd3, 0xe5, 0x3b, 0xeb, 0xa0, 0xfa, 0x1b, 0x30, 0xd3, 0x08, + 0x26, 0x2e, 0x0b, 0x11, 0x43, 0x46, 0xba, 0x8d, 0x4d, 0xf2, + 0x6e, 0xc0, 0x99, 0x63, 0xc8, 0x54, 0x0f, 0xbe, 0xcd, 0xd0, + 0x65, 0xc0, 0xad, 0x19, 0xcd, 0xdb, 0x9b, 0x29, 0x3a, 0xef, + 0xc6, 0x0a, 0x0d, 0xd1, 0xe7, 0x0a, 0xbe, 0x6d, 0xb9, 0x46, + 0x76, 0xdc, 0x51, 0x63, 0xbf, 0x08, 0x41, 0x47, 0xbc, 0x30, + 0x50, 0x00, 0xf8, 0xc6, 0x92, 0x30, 0xec, 0xf0, 0x31, 0xa2, + 0x4e, 0xac, 0xc8, 0x63, 0x14, 0x05, 0x62, 0x22, 0x5e, 0x29, + 0x6b, 0x5e, 0x73, 0xdb, 0x7e, 0x5a, 0xf8, 0x9f, 0xd6, 0xfc, + 0x05, 0x50, 0x14, 0xa9, 0xf3, 0x66, 0xe5, 0x92, 0xad, 0x1f, + 0xe4, 0x65, 0x8b, 0x3b, 0xf5, 0x46, 0x13, 0x71, 0x19, 0x2a, + 0xc2, 0xb4, 0x51, 0xb8, 0xd7, 0x99, 0x23, 0xe5, 0x2b, 0xb2, + 0xfc, 0xb9, 0x1e, 0xd9, 0x5a, 0x3c, 0x7d, 0x4d, 0x33, 0x61, + 0x2a, 0xdf, 0x1a, 0xb5, 0x1f, 0x6a, 0x82, 0x9b, 0xcc, 0xe0, + 0x90, 0x6a, 0x6d, 0x41, 0x11, 0xe7, 0x5c, 0xf1, 0x09, 0x3d, + 0x04, 0xe0, 0x61, 0x64, 0xba, 0x01, 0x89, 0xa4, 0xee, 0x49, + 0x6f, 0x16, 0x8d, 0x72, 0x51, 0x87, 0x1a, 0x56, 0x88, 0x5a, + 0x3c, 0x8f, 0xc6, 0xe9, 0xc8, 0xb2, 0xf4, 0x3b, 0x56, 0x19, + 0x8c, 0x44, 0x04, 0xe4, 0xa6, 0xb2, 0x05, 0xdd, 0x2b, 0xc0, + 0x26, 0x92, 0x6a, 0xe0, 0xe9, 0x43, 0xb4, 0xd6, 0x31, 0x03, + 0xfc, 0xd8, 0x60, 0xd8, 0xbe, 0x5d, 0x09, 0x5b, 0xea, 0x2e, + 0x3d, 0x19, 0x6a, 0x0b, 0xdb, 0xfc, 0x1a, 0x4c, 0xe7, 0xc4, + 0xe0, 0x3a, 0xed, 0xa3, 0xf3, 0x05, 0x29, 0x59, 0xf7, 0x49, + 0x45, 0x09, 0x82, 0xb9, 0x54, 0x64, 0x61, 0x5e, 0x73, 
0x03, + 0xa5, 0xe1, 0x5a, 0x7f, 0x27, 0x3f, 0x9c, 0xf6, 0x11, 0x38, + 0xa3, 0x09, 0xc1, 0x2e, 0x2b, 0xb4, 0xed, 0x5a, 0xc9, 0x4b, + 0x9b, 0x4b, 0x9a, 0xfa, 0x73, 0xe5, 0xa1, 0xc2, 0x9c, 0xfb, + 0x7f, 0x15, 0x54, 0xc0, 0xfe, 0x13, 0xa6, 0x1d, 0x0a, 0xed, + 0xb2, 0x79, 0xcb, 0xcc, 0x04, 0xb8, 0x05, 0x99, 0x62, 0x98, + 0x43, 0xe8, 0x7d, 0xd0, 0xd1, 0xb1, 0xc1, 0x81, 0xb2, 0xe1, + 0x17, 0x2e, 0xd4, 0x54, 0x90, 0x65, 0xf9, 0x5b, 0xc4, 0xc8, + 0xee, 0x9a, 0x14, 0x0a, 0x72, 0x65, 0xa5, 0x3c, 0xc5, 0x2d, + 0xd5, 0xc1, 0x7d, 0x3f, 0x48, 0xd5, 0x41, 0xe6, 0x1e, 0x55, + 0x53, 0xa6, 0x7e, 0x3d, 0xd0, 0xe5, 0xf9, 0xb2, 0x55, 0x2b, + 0x21, 0xad, 0xac, 0x18, 0xdf, 0x7a, 0x3a, 0xc4, 0x5e, 0x62, + 0x94, 0x0b, 0x0d, 0xfd, 0x33, 0x0b, 0x59, 0x53, 0xaa, 0xbd, + 0xc6, 0x26, 0x6f, 0xae, 0xb8, 0x29, 0x43, 0xc4, 0x6a, 0x61, + 0x2d, 0x7f, 0x3e, 0x39, 0x81, 0xd0, 0xad, 0x34, 0xa9, 0x50, + 0x69, 0x46, 0x07, 0x69, 0xf3, 0xd3, 0x74, 0x74, 0x65, 0xb1, + 0x5d, 0x90, 0x8c, 0xb6, 0x39, 0xd5, 0xd1, 0x1b, 0x73, 0xd7, + 0xd4, 0x5d, 0xbb, 0x81, 0x37, 0x86, 0x29, 0x6b, 0x81, 0x1b, + 0x56, 0xb2, 0x90, 0x35, 0xc1, 0xc1, 0x8c, 0x64, 0x59, 0x3a, + 0x6b, 0xa2, 0x35, 0x33, 0x77, 0xf4, 0x14, 0xda, 0xbd, 0x92, + 0x3b, 0xa0, 0x9c, 0x68, 0xce, 0xb9, 0x6e, 0x58, 0x5a, 0x56, + 0x93, 0x26, 0x41, 0x0b, 0x23, 0x18, 0x35, 0x56, 0x6e, 0x1e, + 0x87, 0x16, 0x81, 0xca, 0x03, 0x5e, 0x43, 0x94, 0xec, 0xc5, + 0xdd, 0xb9, 0x81, 0xcd, 0x43, 0x54, 0x91, 0x1a, 0x1f, 0x99, + 0x83, 0x21, 0xb2, 0xf2, 0x13, 0x3e, 0x23, 0x1b, 0x43, 0xd8, + 0x51, 0x51, 0xe2, 0x06, 0x10, 0xc8, 0xb9, 0x46, 0x8e, 0x20, + 0x50, 0xa2, 0x6a, 0x5e, 0x2d, 0xa7, 0xf1, 0xa5, 0x47, 0x6c, + 0x65, 0x2f, 0xd4, 0xc9, 0x76, 0xe1, 0xef, 0xd7, 0x53, 0x21, + 0x52, 0x28, 0xc8, 0x3d, 0x61, 0x77, 0x11, 0x01, 0x71, 0x3b, + 0xdc, 0xac, 0x7f, 0xf8, 0xb0, 0xc0, 0x69, 0xda, 0xb1, 0x61, + 0x94, 0x61, 0x37, 0x96, 0xc7, 0xc7, 0x5b, 0xdd, 0xfb, 0x7f, + 0x00, 0xe0, 0x1f, 0xb4, 0x80, 0x74, 0xf9, 0x30, 0x97, 0xc9, + 0x12, 0xc5, 0x55, 0x82, 0x9b, 0xbf, 0xf0, 0x1d, 0xc7, 0x93, + 0x5a, 
0x32, 0xc6, 0xf5, 0x7c, 0xbd, 0xfe, 0x67, 0x8b, 0x68, + 0x3d, 0x70, 0xd8, 0x24, 0x71, 0xc2, 0x01, 0x2a, 0x44, 0x93, + 0x84, 0x9d, 0x91, 0x69, 0xb4, 0x6d, 0xa7, 0x7a, 0xab, 0x12, + 0xcf, 0xab, 0xaf, 0x7c, 0x7b, 0x75, 0x53, 0xc9, 0x14, 0xb0, + 0x04, 0x9f, 0x40, 0x72, 0x96, 0xf8, 0x95, 0xb4, 0xda, 0x3e, + 0x1b, 0xf7, 0x3c, 0x91, 0x8d, 0x18, 0x52, 0x62, 0x2a, 0x4f, + 0xf4, 0x55, 0x7b, 0x4d, 0x15, 0x59, 0x71, 0xed, 0xf4, 0x25, + 0xf8, 0x26, 0x3f, 0xc6, 0x8f, 0x5b, 0x1d, 0x96, 0x3a, 0x13, + 0x24, 0x08, 0x5f, 0x5b, 0x69, 0x27, 0x5a, 0xbb, 0xb0, 0x86, + 0xdc, 0x84, 0x9d, 0x4a, 0x6b, 0x41, 0xd3, 0x87, 0x57, 0x08, + 0x1f, 0x22, 0x32, 0xc1, 0x7f, 0x2f, 0x84, 0xcb, 0xba, 0x86, + 0x27, 0x87, 0xa6, 0xa4, 0x1a, 0x99, 0xb5, 0xa5, 0xa1, 0xc3, + 0xa9, 0xa4, 0x03, 0x45, 0xc8, 0x2a, 0x94, 0xdf, 0x5c, 0x3b, + 0xc0, 0x72, 0x85, 0xf3, 0xe0, 0x1f, 0xad, 0x44, 0x66, 0x85, + 0xdd, 0x7b, 0xf3, 0x3a, 0x59, 0xd7, 0x81, 0x27, 0x65, 0xcf, + 0xba, 0xf0, 0x74, 0xfd, 0xf5, 0xf5, 0x12, 0xce, 0x31, 0xdf, + 0xf1, 0xdf, 0x22, 0x93, 0x6d, 0x23, 0x7f, 0xa3, 0xa7, 0x54, + 0xa0, 0x05, 0x10, 0xe5, 0x2d, 0x49, 0x14, 0x8f, 0x99, 0xe1, + 0xf5, 0x8e, 0x51, 0xe0, 0xe3, 0xcf, 0x56, 0x2f, 0x6e, 0xae, + 0xa5, 0x91, 0x4e, 0xe2, 0x73, 0x4c, 0xea, 0xc2, 0x25, 0x6a, + 0x8c, 0x6a, 0xb6, 0x26, 0x88, 0x28, 0xb5 +}; + diff --git a/neureka/dense/src/output.c b/neureka/dense/src/output.c new file mode 100644 index 0000000..ab9b752 --- /dev/null +++ b/neureka/dense/src/output.c @@ -0,0 +1,46 @@ +#include "output.h" + +#define OUTPUT_SIZE (175) +PI_L1 uint8_t output[OUTPUT_SIZE]; + +#define GOLDEN_OUTPUT_SIZE (175) +PI_L2 uint8_t golden_output[GOLDEN_OUTPUT_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0xa0, 0x00, 0xdc, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x43, + 0x2d, 0x28, 0x00, 0xba, 0xad, 0x00, 0x00, 0x00, 0x05, 0x80, + 0x00, 0x1b, 0x00, 0x1a, 0xa1, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x6b, 0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x28, + 0x00, 0x1e, 0xff, 0x51, 0x35, 0x0f, 0x00, 0x00, 0x00, 
0xff, + 0x00, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x1f, 0x00, + 0x00, 0x00, 0x00, 0x52, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xba, 0x06, 0x1b, + 0x13, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, + 0x00, 0x00, 0x22, 0x26, 0xa2, 0x00, 0x44, 0x00, 0x0c, 0x00, + 0x3a, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x00, 0x84, 0x00, 0x00, + 0x00, 0x00, 0xec, 0x03, 0x62, 0x2c, 0x00, 0x00, 0x00, 0xa9, + 0x00, 0x00, 0x00, 0x01, 0x09, 0x00, 0x0b, 0x00, 0x25, 0xa0, + 0x00, 0x00, 0x00, 0x13, 0x00, 0x17, 0x00, 0x00, 0x00, 0xff, + 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0xff, 0x00, 0x78, + 0x17, 0x04, 0x00, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00, 0xcd, + 0x00, 0x00, 0x06, 0x1b, 0x48 +}; + +int check_output() { + printf("Checking the output vector:\n"); + + int n_err = 0; + for (int i = 0; i < OUTPUT_SIZE; i++) { + if (output[i] != golden_output[i]) { + printf("ERROR: wrong value of output @ %d: %d vs. golden: %d\n", i, output[i], golden_output[i]); + n_err++; + } + } + + if (n_err == 0) + printf("> Success! No errors found.\n"); + else + printf("> Failure! 
Found %d/%d errors.\n", n_err, OUTPUT_SIZE); + return n_err; + } + + \ No newline at end of file diff --git a/neureka/dense/src/scale.c b/neureka/dense/src/scale.c new file mode 100644 index 0000000..33769cd --- /dev/null +++ b/neureka/dense/src/scale.c @@ -0,0 +1,10 @@ +#include "scale.h" + +#define SCALE_SIZE (35) +PI_L1 uint8_t scale[SCALE_SIZE] = { + 0x06, 0x07, 0x14, 0x06, 0x10, 0x0b, 0x1d, 0x14, 0x01, 0x1b, + 0x03, 0x04, 0x17, 0x1c, 0x05, 0x13, 0x06, 0x0f, 0x0c, 0x0c, + 0x02, 0x04, 0x0e, 0x19, 0x0b, 0x1a, 0x12, 0x10, 0x06, 0x0b, + 0x09, 0x16, 0x06, 0x01, 0x0c +}; + diff --git a/neureka/dense/src/weight.c b/neureka/dense/src/weight.c new file mode 100644 index 0000000..811eb11 --- /dev/null +++ b/neureka/dense/src/weight.c @@ -0,0 +1,3030 @@ +#include "weight.h" + +#define WEIGHT_SIZE (30240) +PI_L1 uint8_t weight[WEIGHT_SIZE] = { + 0xfa, 0x5e, 0x7d, 0x0e, 0x0d, 0x0d, 0x03, 0xc1, 0x56, 0x6a, + 0x95, 0x9e, 0x1e, 0xdf, 0x88, 0x4d, 0xbe, 0x88, 0xc4, 0x67, + 0x6e, 0x2e, 0x9e, 0x60, 0xcc, 0xb4, 0x25, 0x04, 0xe5, 0xfc, + 0x82, 0x6b, 0x1e, 0x34, 0x8a, 0x08, 0x47, 0xda, 0xe8, 0xeb, + 0x39, 0x54, 0x10, 0x81, 0x36, 0x76, 0x53, 0xa3, 0x87, 0x45, + 0x86, 0xd4, 0x9b, 0xb1, 0x58, 0xb8, 0xad, 0xa8, 0x47, 0x0a, + 0x0c, 0xde, 0x20, 0xc5, 0xae, 0xf5, 0xaa, 0xaf, 0x29, 0x92, + 0x48, 0x40, 0xd2, 0x79, 0x9d, 0x5c, 0xa5, 0x8e, 0x6d, 0x1a, + 0x0b, 0x12, 0xf4, 0x03, 0xa3, 0x1f, 0x45, 0x3c, 0x16, 0xd5, + 0xb6, 0xe1, 0x25, 0xce, 0x4a, 0x1d, 0xe4, 0xc8, 0xd6, 0xda, + 0x54, 0xfa, 0x6c, 0x63, 0x35, 0x4b, 0x0b, 0x3a, 0x84, 0xe5, + 0x8c, 0x89, 0x18, 0xb9, 0x94, 0x6d, 0x01, 0xf2, 0x5d, 0x39, + 0xc0, 0x2d, 0x4d, 0x97, 0x42, 0xaa, 0x49, 0x6b, 0x2b, 0xa0, + 0xd4, 0x4b, 0x06, 0xeb, 0x48, 0xab, 0xc5, 0x11, 0x2b, 0xf8, + 0xb9, 0xc8, 0xa7, 0xb0, 0xe5, 0xe8, 0x69, 0x5b, 0xe8, 0xaf, + 0xdf, 0xa2, 0x32, 0x3a, 0xc6, 0x20, 0x86, 0xc9, 0x2d, 0x81, + 0xa3, 0xd2, 0x07, 0xd2, 0x43, 0x24, 0xd8, 0x02, 0x0c, 0x37, + 0x48, 0x6f, 0x5f, 0x83, 0x47, 0x76, 0xd5, 0xf6, 0xe6, 0xb1, + 0xef, 0xcb, 0x5a, 0x39, 0xc0, 
0x85, 0x0e, 0xa9, 0x30, 0x39, + 0x75, 0x01, 0x06, 0xc7, 0x29, 0x35, 0xa3, 0x98, 0x56, 0xd9, + 0x40, 0x54, 0xe8, 0x0d, 0x4f, 0xb7, 0x68, 0xfb, 0x57, 0x82, + 0x2f, 0x56, 0x91, 0x70, 0xe2, 0xab, 0xe7, 0xcb, 0x5a, 0x19, + 0xc0, 0xa5, 0x0f, 0xa9, 0xb0, 0x39, 0x75, 0x01, 0x06, 0xcf, + 0x2d, 0xb5, 0xa3, 0x98, 0x56, 0xd9, 0x49, 0x54, 0xe8, 0x0b, + 0x4c, 0x37, 0x68, 0xfb, 0x57, 0x83, 0x2f, 0x56, 0x91, 0x70, + 0xe2, 0xbb, 0x18, 0x34, 0xa5, 0xe6, 0x3f, 0x5a, 0xf0, 0x56, + 0xcf, 0xc6, 0x8a, 0xfe, 0xf9, 0x30, 0xd2, 0x4a, 0x5c, 0x67, + 0xa9, 0x26, 0xb6, 0xab, 0x17, 0xf4, 0xb3, 0xc8, 0x97, 0x04, + 0xa8, 0x7c, 0xd0, 0xa9, 0x6e, 0x8f, 0x1d, 0x44, 0xe0, 0x3e, + 0x07, 0xbc, 0x00, 0x64, 0x58, 0x14, 0x26, 0xc1, 0x52, 0x73, + 0x88, 0xf8, 0x22, 0xa9, 0x3c, 0x42, 0x0d, 0x4a, 0xec, 0xf8, + 0xc2, 0xd7, 0x29, 0x6d, 0x02, 0x04, 0xc4, 0x15, 0x40, 0x12, + 0xe1, 0xc4, 0x5c, 0x34, 0x8f, 0x46, 0x86, 0xbc, 0x93, 0xf4, + 0xeb, 0x2d, 0xcb, 0x67, 0xb6, 0x56, 0x32, 0x8c, 0xe0, 0x87, + 0x3e, 0x17, 0x06, 0xe6, 0x57, 0xf9, 0xfc, 0x7f, 0x6d, 0x3d, + 0x08, 0x83, 0x1c, 0x36, 0x47, 0x2b, 0x55, 0xf5, 0x89, 0x66, + 0xcf, 0xf5, 0x5a, 0x34, 0x95, 0x7b, 0x6f, 0xfa, 0x1a, 0x5b, + 0x1b, 0x05, 0x62, 0x8d, 0x0a, 0x21, 0x83, 0x83, 0x22, 0x3c, + 0x39, 0x47, 0x24, 0x17, 0x8f, 0xea, 0xce, 0x2d, 0xdf, 0x47, + 0x86, 0x49, 0x4f, 0xba, 0x0e, 0x2f, 0xa5, 0xa5, 0x1b, 0x3d, + 0xc6, 0x20, 0x4f, 0xf0, 0x86, 0x62, 0x47, 0xed, 0x22, 0x19, + 0x71, 0xad, 0x27, 0xaf, 0xe9, 0x56, 0x93, 0xf5, 0x5c, 0x47, + 0xff, 0xce, 0xbf, 0x2a, 0xa8, 0x95, 0x9a, 0xb5, 0x17, 0xc1, + 0xa6, 0x3b, 0x6e, 0xd9, 0x6a, 0xde, 0xf3, 0x7d, 0xd8, 0x33, + 0xca, 0x0d, 0xa9, 0xd2, 0x48, 0x38, 0x3b, 0x84, 0x48, 0x60, + 0xb7, 0x2c, 0x8a, 0x85, 0xb8, 0xcb, 0x01, 0xe5, 0xe5, 0x94, + 0xbf, 0xb4, 0x5b, 0x60, 0x98, 0xfb, 0xe6, 0xae, 0x45, 0xf1, + 0x6b, 0xbe, 0x65, 0x49, 0x54, 0x33, 0xd7, 0x32, 0x2f, 0xc0, + 0x5c, 0x2a, 0x31, 0xa7, 0x0c, 0xe6, 0xaf, 0x2a, 0x9f, 0x64, + 0x7c, 0x57, 0xab, 0xbb, 0xa7, 0x15, 0xbc, 0x92, 0x84, 0x2f, + 0x81, 0x87, 0xe3, 0x8f, 0x45, 0xf1, 0x6b, 
0xbe, 0x65, 0x69, + 0x5c, 0x33, 0xc6, 0x32, 0x2f, 0xc0, 0x5c, 0x28, 0x39, 0xa5, + 0x0c, 0xe4, 0xaf, 0x2e, 0x9b, 0x64, 0x78, 0x47, 0xab, 0xbf, + 0xe7, 0x15, 0xbc, 0x94, 0xc0, 0x2f, 0x90, 0x87, 0xe3, 0x8f, + 0xba, 0x0e, 0x94, 0x41, 0x9a, 0x96, 0xa3, 0xcc, 0x39, 0xcd, + 0xd0, 0x3f, 0xa3, 0xd7, 0xc6, 0x5a, 0xf3, 0x1b, 0x50, 0xd1, + 0x64, 0x9b, 0x87, 0xb8, 0x54, 0x40, 0x18, 0xea, 0x43, 0x6b, + 0x3f, 0xd0, 0x6f, 0x78, 0x1c, 0x70, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 
0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0xfd, 0x34, 0x94, 0x6f, 0x49, 0x64, + 0x1b, 0xa1, 0xe6, 0x63, 0x32, 0x22, 0xb6, 0xbd, 0xf8, 0x2e, + 0xe9, 0x90, 0x26, 0xe4, 0xd1, 0x61, 0x06, 0x16, 0x02, 0x9c, + 0x3f, 0xb4, 0xc2, 0x90, 0x37, 0x4c, 0xdc, 0xbf, 0x0e, 0xf0, + 0x93, 0x83, 0x4c, 0x08, 0xfd, 0x5a, 0x0f, 0x36, 0xec, 0x00, + 0x02, 0x9d, 0xb8, 0xcf, 0xfa, 0x1e, 0x66, 0xfc, 0x57, 0x69, + 0x08, 0xaa, 0x8a, 0x14, 0x6d, 0x0b, 0x05, 0xc9, 0x9a, 0xb5, + 0xd9, 0xf6, 0x16, 0x84, 0xb7, 0xae, 0x0e, 0xd9, 0x94, 0x25, + 0xce, 0x28, 0xde, 0xbf, 0x96, 0xb2, 0xa5, 0x89, 0x31, 0xa3, + 0xd8, 0x32, 0xe6, 0x63, 0x4c, 0x89, 0x37, 0xd7, 0x84, 0x2e, + 0x4a, 0x5f, 0x4b, 0xcb, 0x0c, 0x0c, 0x51, 0x4d, 0x94, 0x97, + 0x6a, 0xc4, 0x02, 0xf8, 0x74, 0xee, 0xaa, 0x26, 0x41, 0xc2, + 0xbd, 0xc2, 0xe5, 0x68, 0x98, 0xfd, 0x8e, 0xe2, 0x90, 0x24, + 0xe4, 0x1b, 0x25, 0x4a, 0xbe, 0xd3, 0xde, 0xad, 0x63, 0x39, + 0x76, 0x6b, 0x4a, 0xbe, 0xbf, 0x16, 0xc9, 0xb2, 0xcb, 0x32, + 0x3d, 0x82, 0xf6, 0xaa, 0xec, 0x36, 0x21, 0x4d, 0x0b, 0x87, + 0x87, 0x90, 0x2c, 0x82, 0x47, 0xbb, 0x5c, 0xd0, 0x9b, 0xc5, + 0x19, 0xb8, 0x3b, 0xa6, 0xfd, 0x00, 0x16, 0x9a, 0x84, 0xa6, + 0xcc, 0x84, 0xd6, 0x44, 0x1a, 0xe8, 0x0d, 0xfb, 0xb7, 0x8b, + 0x6c, 0x32, 0x20, 0x4c, 0x03, 0x3f, 0x83, 0xc8, 0xa2, 0xa2, + 0x65, 0x2f, 0x2d, 0x72, 0xbf, 0xc5, 0x19, 0xaa, 0x7b, 0x8c, + 0xae, 0x01, 0x5b, 0x9e, 0xcc, 0xce, 0x9c, 0xb6, 0xc4, 0xf4, + 0x9a, 0xe8, 0x0d, 0xea, 0xb7, 0x8a, 0x6c, 0x32, 0x20, 0x4c, + 0x03, 0x3e, 0x87, 0xd8, 0xa2, 0xa2, 0x65, 0x2f, 0x2c, 0x72, + 0xbf, 0xc5, 0x19, 0xba, 0x7b, 0x8c, 0xef, 0x01, 0x5b, 0x9a, + 0xcc, 0xce, 0x9c, 0xb6, 0xc4, 0xf4, 0x65, 0x17, 0xf2, 0x15, + 0x48, 0x75, 0x93, 0xcd, 0xdf, 0xb3, 0xfc, 0xc1, 0x78, 0x27, + 0x5d, 0x5d, 0x9a, 0xd0, 0xd3, 0x8d, 0x40, 0x3a, 0xe6, 0x45, + 0x84, 0x73, 0x10, 0xfe, 0xa4, 0x65, 0x33, 0x31, 0x63, 0x49, + 0x3b, 
0x0b, 0xd1, 0xe4, 0x92, 0xa8, 0x5e, 0x6f, 0x63, 0xa3, + 0x75, 0xb2, 0xa0, 0x2d, 0xaf, 0x91, 0x08, 0x93, 0x07, 0xff, + 0xd8, 0x41, 0xe7, 0x59, 0x5d, 0xad, 0xd4, 0x72, 0xc4, 0x0d, + 0x48, 0x2b, 0xfc, 0xc2, 0xa0, 0x27, 0xdf, 0x1c, 0x6b, 0xcb, + 0x67, 0xc3, 0x6e, 0x6f, 0xf7, 0xf5, 0x85, 0x76, 0x87, 0xec, + 0xd9, 0xb0, 0x1f, 0xdd, 0xe5, 0xa6, 0x32, 0x8a, 0x6c, 0xdb, + 0x74, 0xe0, 0x58, 0x5e, 0xce, 0x0e, 0xa2, 0x69, 0xa4, 0x5d, + 0x7e, 0x26, 0x1f, 0x16, 0x22, 0xbd, 0x01, 0x57, 0x83, 0x76, + 0x5c, 0x81, 0x35, 0x06, 0x15, 0xcd, 0x59, 0xf3, 0x3b, 0xbf, + 0x03, 0xaa, 0x51, 0xd7, 0x74, 0xf8, 0x8c, 0x8c, 0xd7, 0x5e, + 0x4a, 0xa1, 0x3f, 0x3b, 0x3f, 0xca, 0x38, 0xd5, 0xf3, 0x7b, + 0x65, 0x34, 0xf7, 0x9a, 0x58, 0x66, 0xe8, 0xc6, 0xb1, 0x3b, + 0x4d, 0x19, 0xfe, 0xcb, 0xd7, 0xb2, 0xa2, 0xdf, 0x51, 0x59, + 0xb1, 0x73, 0x2e, 0xd5, 0x92, 0xfc, 0x05, 0x60, 0xe0, 0x6a, + 0x2e, 0x7d, 0x12, 0x2d, 0x66, 0xfc, 0x28, 0xf2, 0x44, 0x64, + 0xd4, 0xac, 0x8a, 0x94, 0x4f, 0x80, 0xc7, 0x0d, 0xaa, 0xcd, + 0xc1, 0x90, 0xd5, 0x43, 0x2a, 0xbc, 0x91, 0xd6, 0x30, 0x89, + 0xea, 0x79, 0x48, 0xcb, 0xa0, 0xb0, 0x4f, 0x20, 0xc1, 0x7d, + 0xd1, 0x70, 0x65, 0xfa, 0xc6, 0x70, 0xca, 0x26, 0x93, 0x1a, + 0x3a, 0x33, 0xc2, 0x49, 0x3a, 0x01, 0x11, 0xb2, 0x9e, 0x4b, + 0xa4, 0x78, 0x7c, 0xf5, 0x3c, 0xd4, 0xc6, 0x71, 0x40, 0xc2, + 0x62, 0xea, 0xcf, 0x63, 0x11, 0xfd, 0x8a, 0xf8, 0x65, 0xfa, + 0x46, 0x70, 0xca, 0xa6, 0x92, 0x12, 0x3a, 0x22, 0xc6, 0x49, + 0x2a, 0x01, 0x11, 0xb2, 0x9e, 0x4b, 0xa0, 0x78, 0x7c, 0xf5, + 0x3c, 0xd4, 0xc6, 0x71, 0x40, 0xc2, 0xe2, 0xea, 0xcf, 0x63, + 0x11, 0xfd, 0x82, 0xf8, 0x9a, 0x05, 0xb9, 0x8f, 0x35, 0x59, + 0x6d, 0xed, 0xc5, 0xdd, 0x39, 0xb6, 0xd5, 0xfe, 0xee, 0x4d, + 0x61, 0xb4, 0x5f, 0x87, 0x83, 0x0a, 0xc3, 0x2b, 0x39, 0x8e, + 0xbf, 0x3d, 0x1d, 0x15, 0x30, 0x9c, 0xee, 0x02, 0x7d, 0x07, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 
0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe7, 0xf1, + 0xe5, 0x0b, 0xa8, 0xa0, 0x23, 0x89, 0xee, 0xe0, 0x4c, 0x00, + 0x99, 0xe0, 0xeb, 0xdd, 0x41, 0x87, 0x83, 0x61, 0x4c, 0x85, + 0xcb, 0x5d, 0x7a, 0xfe, 0xe6, 0x2c, 0x49, 0x58, 0xc0, 0xc1, + 0xb3, 0xbe, 0xdc, 0xd4, 0xce, 0xbe, 0x81, 0x22, 0xc2, 0x79, + 0x13, 0x7c, 0xd4, 0x38, 0xaa, 0x8d, 0x91, 0x64, 0xbb, 0x29, + 0xd4, 0xff, 0x36, 0x9c, 0x66, 0xc0, 0x19, 0x0d, 0x36, 0x82, + 0x49, 0x39, 0xfd, 0xd8, 0xf7, 
0xa0, 0xd4, 0xa8, 0xa7, 0x44, + 0xdd, 0x66, 0x47, 0xd9, 0x12, 0x58, 0x50, 0x11, 0x5c, 0x9b, + 0x0c, 0xd5, 0x06, 0xb0, 0xab, 0xac, 0x44, 0xb3, 0xb5, 0xb9, + 0x7f, 0x03, 0x67, 0x35, 0x7b, 0xed, 0xe2, 0xb0, 0xf4, 0xbc, + 0x42, 0xf1, 0x60, 0x64, 0x26, 0x22, 0xc3, 0xf3, 0x20, 0x89, + 0x94, 0xcc, 0x12, 0xcf, 0x3d, 0x25, 0x3c, 0xc3, 0x57, 0xa1, + 0xd9, 0x8e, 0xfc, 0x32, 0xd8, 0xc8, 0x95, 0x72, 0xb2, 0xd9, + 0xf0, 0xbc, 0x72, 0x61, 0x54, 0x64, 0xed, 0xce, 0x34, 0x7b, + 0x38, 0xf8, 0xd6, 0x11, 0xce, 0x37, 0x0a, 0xd0, 0x6e, 0x84, + 0xae, 0xa6, 0xfa, 0x88, 0xb2, 0x77, 0x28, 0x57, 0x0c, 0x80, + 0xc2, 0x7e, 0xe1, 0xc9, 0x03, 0xd5, 0x6f, 0xa6, 0xfc, 0x1a, + 0xee, 0x64, 0x7b, 0x27, 0x6f, 0xf0, 0x73, 0x98, 0x33, 0xc3, + 0xe3, 0x1c, 0xba, 0xd8, 0x24, 0xd4, 0x7c, 0xb7, 0xfc, 0x8b, + 0xf2, 0xe0, 0x81, 0x5d, 0x26, 0x9c, 0xf6, 0xf9, 0x39, 0x43, + 0x13, 0xdc, 0x7f, 0xa2, 0x75, 0x81, 0xfc, 0xa4, 0x79, 0x45, + 0x2f, 0x83, 0x76, 0x74, 0x33, 0xc3, 0xe7, 0x1c, 0xba, 0xd8, + 0x24, 0x54, 0x7c, 0xa7, 0xfc, 0x8b, 0xf2, 0xe0, 0x81, 0x5d, + 0x26, 0x92, 0xf6, 0xf8, 0x31, 0x43, 0x13, 0xdc, 0x7f, 0xa2, + 0x74, 0x81, 0xfc, 0x24, 0x79, 0x45, 0x2f, 0xb3, 0x76, 0xf4, + 0xcc, 0x3c, 0x18, 0xe3, 0x45, 0x27, 0xdb, 0x2b, 0x83, 0x58, + 0x03, 0x74, 0x0d, 0x1f, 0x7e, 0xa2, 0xd9, 0x6d, 0x09, 0x07, + 0xce, 0xbc, 0xec, 0x23, 0x80, 0x5d, 0x8b, 0x7e, 0x03, 0xdb, + 0x86, 0xba, 0xd0, 0x4c, 0x89, 0x0b, 0x15, 0xd9, 0x6c, 0x4d, + 0xa7, 0xc9, 0x3c, 0x21, 0xe9, 0x81, 0xdb, 0xba, 0x86, 0xbf, + 0xb7, 0x72, 0x99, 0xf6, 0x0c, 0xe0, 0xf9, 0x8b, 0x46, 0x87, + 0x1a, 0xdd, 0xd5, 0x0e, 0x1c, 0x00, 0x19, 0x56, 0xbe, 0xfe, + 0xa6, 0x6c, 0xdb, 0xe8, 0x2c, 0x4e, 0x09, 0x19, 0x79, 0x66, + 0x9b, 0x44, 0xe7, 0xb3, 0x7d, 0x70, 0x3f, 0x2b, 0x7b, 0xe1, + 0x65, 0xf3, 0x3f, 0x30, 0x0b, 0x42, 0x5e, 0x90, 0x23, 0x62, + 0x98, 0x4d, 0x10, 0x90, 0x92, 0x62, 0xb8, 0x31, 0x2c, 0x9b, + 0x15, 0xb8, 0x38, 0x53, 0x89, 0x9f, 0xdd, 0xff, 0x79, 0x16, + 0xd7, 0x54, 0xc5, 0x7c, 0x51, 0xd5, 0xdf, 0x9b, 0xa0, 0x1a, + 0xf5, 0x15, 0x95, 0x2d, 0x0c, 0xc4, 0xcb, 
0x48, 0x5c, 0xf0, + 0x2b, 0x13, 0xa8, 0x00, 0xda, 0x48, 0xf8, 0x96, 0x3a, 0x81, + 0x7b, 0x53, 0x91, 0x49, 0xe8, 0x83, 0xba, 0x8c, 0x59, 0xca, + 0x12, 0xe0, 0x23, 0xf4, 0x5b, 0xd7, 0xb5, 0xff, 0xc2, 0xc5, + 0xa0, 0xb9, 0xed, 0xa3, 0x5e, 0x9c, 0x9e, 0x37, 0x66, 0xa1, + 0x8b, 0xd5, 0x3e, 0xfd, 0x0a, 0x15, 0x3f, 0xb5, 0x71, 0x3c, + 0x3b, 0xbf, 0xff, 0xde, 0xe4, 0x8b, 0x6a, 0xf1, 0x34, 0x35, + 0x9a, 0x32, 0xe5, 0xd2, 0x6b, 0x10, 0xdf, 0xe7, 0x8b, 0xaf, + 0x5a, 0x5c, 0x23, 0xc9, 0x07, 0xa6, 0x73, 0x71, 0xe2, 0x96, + 0x9f, 0x14, 0x9a, 0xf5, 0xb0, 0xa4, 0x1d, 0x13, 0xef, 0xff, + 0x64, 0x3b, 0xf4, 0xf9, 0xb8, 0x31, 0x9b, 0x71, 0xf1, 0x52, + 0xfe, 0x92, 0xdf, 0xe7, 0x81, 0xe7, 0xf6, 0x71, 0x2e, 0xca, + 0x1f, 0xa2, 0x53, 0x71, 0xfa, 0x96, 0x9f, 0x14, 0x9a, 0xf5, + 0xb1, 0xa4, 0x1d, 0x13, 0xef, 0xfe, 0x64, 0x3a, 0xf0, 0xf9, + 0x38, 0x31, 0x9b, 0x71, 0xf5, 0x52, 0xfa, 0x92, 0xdf, 0xe3, + 0x81, 0xa7, 0x76, 0x75, 0x2a, 0xc2, 0x1f, 0xa6, 0xac, 0x8e, + 0x05, 0x69, 0x60, 0xeb, 0x65, 0x0a, 0x4e, 0x5b, 0xe2, 0xec, + 0x10, 0x01, 0x9b, 0xc5, 0x0f, 0x06, 0xc7, 0xce, 0x64, 0x8e, + 0x0a, 0xad, 0x05, 0x6d, 0x20, 0x1c, 0x7e, 0x58, 0x89, 0x8a, + 0xd5, 0x3d, 0xe0, 0x59, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xc4, 0x5f, 0xfb, 0xa9, 0x84, 0x01, 0x52, 0x16, + 0x8c, 0xdd, 0x09, 0x32, 0xb6, 0x6b, 0xbb, 0x8f, 0xda, 0x5e, + 0x28, 0x68, 0x75, 0xbe, 0x9e, 0xcb, 0x7e, 0xf3, 0x20, 0x46, + 0xdf, 0x94, 0xfe, 0x87, 0x86, 0xd0, 0xf0, 0x44, 0x53, 0xa8, + 0x66, 0x05, 0x1c, 0xf5, 0x14, 0xfb, 0xae, 0xff, 0x1b, 0xac, + 0xfd, 0xe5, 0x68, 0x14, 0x46, 0x87, 0x11, 0x9c, 0x72, 0x2a, + 0xa9, 0x3e, 0xd9, 0x2b, 0x54, 0xfc, 0x3f, 0x82, 0x84, 0xfb, + 0xc3, 0x27, 0x8e, 0x93, 0xff, 0x3d, 0x67, 0x9e, 0x1c, 0xbc, + 0x05, 0x23, 0xf7, 0xc3, 0x16, 0x4f, 0x1e, 0x38, 0x72, 0x0c, + 0x83, 0x05, 0x1e, 0xb1, 0x64, 0x9f, 0x80, 0xd2, 0x58, 0xde, + 0x02, 0xd9, 0x6f, 0x7b, 0x8c, 0x3f, 0x74, 0x86, 0x90, 0xcf, + 0x54, 0xa1, 0x90, 0x64, 0x3b, 0x31, 0xef, 0x18, 0x11, 0xee, + 0x18, 0xb2, 0xf6, 0x85, 0x03, 0x2f, 0xb3, 0x61, 0x79, 0xb7, + 0x84, 0x2d, 0xef, 0xdc, 0xb8, 0xdb, 0x0a, 0xc9, 0xae, 0x8e, + 0x2f, 0x9b, 0x61, 0x01, 0xd0, 0x0f, 0x5f, 0xb2, 0x41, 0x00, + 0x30, 0x2c, 0x43, 0x29, 0x64, 0x47, 0xe0, 0x2c, 0x3f, 0x77, + 0xd0, 0xb3, 0x26, 0x43, 0x13, 0x00, 0x88, 0x03, 0x0e, 0x37, + 0x9b, 
0x47, 0xfd, 0x50, 0xe3, 0x62, 0x6b, 0x63, 0x98, 0x53, + 0xdc, 0xd4, 0x5c, 0xb1, 0x71, 0x03, 0x1a, 0x2d, 0x8f, 0x19, + 0x27, 0xdd, 0x78, 0xa8, 0x03, 0xf5, 0x9c, 0x26, 0x80, 0xc1, + 0x11, 0x07, 0x8a, 0x2f, 0x6e, 0xf2, 0x83, 0xf1, 0x23, 0x50, + 0x61, 0x6e, 0x62, 0x61, 0x3c, 0x43, 0xd6, 0xc6, 0x5c, 0xb1, + 0x71, 0x00, 0x1a, 0x2c, 0xcf, 0x19, 0x27, 0xdf, 0x78, 0xa8, + 0x33, 0xf5, 0x90, 0x27, 0x02, 0xc1, 0x11, 0x07, 0x8a, 0x2b, + 0x4e, 0xf2, 0x83, 0xd1, 0x23, 0x50, 0x63, 0x6a, 0x26, 0x61, + 0x3c, 0x43, 0xd6, 0xc6, 0xa3, 0x4e, 0x8e, 0xff, 0xe5, 0xd3, + 0x30, 0xe6, 0xd8, 0x20, 0x87, 0x57, 0xcc, 0x0a, 0x6f, 0xd8, + 0xfd, 0x3e, 0xee, 0xf8, 0x75, 0xd4, 0xb1, 0x0d, 0x7c, 0x2e, + 0xdc, 0xaf, 0x9c, 0x95, 0xd9, 0x9e, 0xc3, 0xbc, 0x29, 0x39, + 0x6d, 0xb7, 0x3d, 0x1e, 0x6e, 0x69, 0xa5, 0xf0, 0xbf, 0x8b, + 0xe6, 0x03, 0x2f, 0x9f, 0x76, 0xe9, 0xb9, 0x2a, 0x5b, 0x16, + 0x0c, 0xee, 0x65, 0xca, 0x7f, 0x9d, 0x44, 0xc1, 0x39, 0x2e, + 0x8e, 0xbe, 0xae, 0xa6, 0x16, 0x8b, 0xe9, 0x61, 0x8a, 0xe8, + 0x93, 0xaf, 0x0f, 0x08, 0x9b, 0xbf, 0xa4, 0xec, 0xae, 0x62, + 0x3d, 0xe4, 0xab, 0x70, 0x3a, 0x2b, 0x0d, 0xd6, 0x80, 0xa8, + 0x5f, 0xf7, 0x83, 0x3a, 0xfd, 0x41, 0xdf, 0xec, 0xc6, 0x5c, + 0x32, 0xd7, 0xce, 0x7d, 0xe2, 0x8d, 0x8d, 0xb0, 0xec, 0xdc, + 0xfc, 0x16, 0x4f, 0x58, 0x5e, 0x6b, 0x34, 0x38, 0x9f, 0xf9, + 0x64, 0xdd, 0x43, 0x35, 0xdf, 0x01, 0x04, 0xa9, 0xb9, 0xd7, + 0x43, 0x89, 0x7d, 0x78, 0x05, 0x65, 0x1a, 0x13, 0x37, 0x0f, + 0x22, 0xb7, 0x8e, 0xe8, 0xe2, 0x3a, 0x42, 0xa3, 0x3e, 0x11, + 0xae, 0x4f, 0x7e, 0x15, 0xcd, 0x34, 0xc6, 0xd4, 0x53, 0xa7, + 0xdb, 0xfe, 0x0c, 0xe2, 0x82, 0x82, 0x9d, 0x52, 0x8d, 0xb1, + 0x5a, 0x61, 0xfb, 0xba, 0x77, 0xc3, 0x4f, 0xcf, 0x20, 0x12, + 0x99, 0x56, 0x67, 0x0f, 0x56, 0x8c, 0x0c, 0xbc, 0x0c, 0x25, + 0x2a, 0xbe, 0x77, 0x22, 0xc6, 0xb4, 0x50, 0xb7, 0xe2, 0x10, + 0xca, 0x2c, 0xff, 0x97, 0xe6, 0xc8, 0x58, 0x75, 0x07, 0x5c, + 0x55, 0x77, 0xd2, 0xdb, 0x30, 0x2a, 0x5e, 0x18, 0x4f, 0x5b, + 0x4e, 0x1f, 0x2d, 0x6d, 0x1e, 0x3c, 0x8e, 0x9a, 0x73, 0x4e, + 0x57, 0x85, 0x59, 
0x86, 0x84, 0x30, 0x08, 0x8e, 0xf9, 0x97, + 0xca, 0x91, 0x58, 0x6d, 0x93, 0x0f, 0x75, 0x77, 0xd2, 0xdf, + 0x30, 0x2a, 0x1f, 0x18, 0x4f, 0x4b, 0x4e, 0x1f, 0x2d, 0x6d, + 0x1c, 0x3c, 0x8e, 0x9a, 0x73, 0x4e, 0x57, 0x85, 0x59, 0x86, + 0x44, 0x10, 0x0a, 0x8e, 0xfb, 0x97, 0xce, 0x91, 0x58, 0x65, + 0x13, 0x0f, 0x8a, 0x88, 0x2d, 0x20, 0xcf, 0xd5, 0xe0, 0xe7, + 0xb0, 0xb4, 0xb1, 0xe0, 0xd2, 0x92, 0xe3, 0xc3, 0x71, 0x65, + 0x8c, 0xb1, 0xa8, 0x7a, 0xa6, 0x79, 0xbb, 0xef, 0xf5, 0x71, + 0x04, 0x68, 0x31, 0x6e, 0xa7, 0x9a, 0xec, 0xf0, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x49, 0x9e, 0x07, + 0xe9, 0x01, 0x2d, 0x8a, 0xb5, 0xeb, 0x9d, 0x21, 0x31, 0xbb, + 0x4c, 0x26, 0xee, 0xda, 0xae, 0x11, 0x8f, 0x95, 0x1a, 0xa9, + 0x74, 0xae, 0xda, 0x53, 0x58, 0xd8, 0x92, 0x81, 0x55, 0xaf, + 0x14, 0x79, 0x72, 0x01, 0xff, 0xb4, 0xac, 0xc4, 0x33, 0xa8, + 0xc0, 0xea, 0xee, 0xe4, 0x8f, 0xe7, 0xfc, 0xbf, 0x3a, 0xd5, + 0x17, 0x01, 0x0d, 0xb3, 0x21, 0x52, 0x58, 0x58, 0x53, 0x44, + 0x71, 0x0b, 0xf0, 0x6d, 0x90, 0xb6, 0x1d, 0x86, 0x3c, 0x9d, + 0xa1, 0xa3, 0xa6, 0xcd, 0xfc, 0x03, 0xd3, 0x50, 0x63, 0xf0, + 0xd0, 0x0c, 0x31, 0x6e, 0x9d, 0x8c, 0xdc, 0x4d, 0xde, 0x0d, + 0x0c, 0xb2, 0x30, 0xdf, 0xe2, 0x05, 0x56, 0x09, 0x3c, 0x37, + 0x81, 0x81, 0x7c, 0x37, 0x94, 0x6f, 0x2e, 0x96, 0x18, 0xa6, + 0xe7, 0x7e, 0x50, 0xc8, 0x49, 0x51, 0x6d, 0x25, 0x1f, 0x47, + 0xae, 0x18, 0xf8, 0xc1, 0x9f, 0x12, 0xdb, 0x04, 0xe9, 0x45, + 0xde, 0x93, 0x20, 0x63, 0xa9, 0xaa, 0x22, 0xf8, 0x42, 0x0d, + 0x84, 0x73, 0x47, 0xc5, 0xbc, 0x83, 0xf9, 0x53, 0x05, 0xdb, + 0x7c, 0xbe, 0x9c, 0xb0, 0xd5, 0x4a, 0x1c, 0xcd, 0xba, 0xa8, + 0xb2, 0xf6, 0x0a, 0x36, 0x70, 0x17, 0x1b, 0x9d, 0x4e, 0x8c, + 0xb0, 0xae, 0xdf, 0x75, 0x35, 0xdb, 0xf4, 0x93, 0x7f, 0x9f, + 0x3a, 0xaa, 0xa0, 0x59, 0x61, 0xdf, 0x6b, 0x23, 0xff, 0x36, + 0x4d, 0x2a, 0xbc, 0x80, 0x9a, 0x08, 0x8e, 0x67, 0x68, 0x72, + 0x3c, 0x14, 0x1b, 0xbd, 0x0c, 0x08, 0xd1, 0xcb, 0xfa, 0xf8, + 0x64, 0x9f, 0xf4, 0x93, 0x6f, 0x9f, 0x3a, 0xaa, 0xa0, 0x59, + 0x61, 0xdf, 0x69, 0x23, 0xbe, 0x36, 0x5d, 0x4a, 0xbc, 0x80, + 0xba, 0x48, 0x8e, 0x76, 0x68, 0x72, 0x3c, 0x17, 0x1b, 0xbd, + 0x0c, 0x88, 0xf1, 0xaa, 0xfa, 0xf8, 0x64, 0x9f, 0x0b, 0x6c, + 0x90, 0x60, 0xc5, 0x55, 0x5f, 0xa6, 0x9e, 0x20, 0x96, 0xdc, + 0x41, 0xc9, 0xa2, 0xb5, 0x43, 0x7f, 0x45, 
0xb7, 0x71, 0x89, + 0x97, 0x8d, 0xc3, 0xe8, 0xe4, 0x42, 0xf3, 0x77, 0x0e, 0x55, + 0x05, 0x07, 0x9b, 0x60, 0x3b, 0xc3, 0x20, 0x74, 0xc0, 0x66, + 0xcf, 0xb0, 0x7c, 0xde, 0x76, 0x5a, 0xef, 0xaa, 0x0d, 0x0f, + 0xe7, 0xb8, 0x08, 0x58, 0xc1, 0x83, 0x67, 0x7e, 0x5b, 0x56, + 0x69, 0x45, 0x8b, 0xa6, 0x5f, 0xa5, 0x0c, 0xd4, 0x59, 0x3b, + 0xe7, 0xd5, 0x8a, 0xae, 0xd8, 0xe5, 0x09, 0xfc, 0xbf, 0xe6, + 0x48, 0x83, 0x51, 0x7e, 0xbb, 0x65, 0x2d, 0x3a, 0xc3, 0xfb, + 0xee, 0x3c, 0x9b, 0x34, 0x52, 0xed, 0x53, 0x3c, 0xeb, 0x3c, + 0x7f, 0x12, 0xc9, 0xf7, 0xb3, 0x6c, 0x44, 0x19, 0x41, 0x57, + 0xa8, 0x34, 0x45, 0x50, 0x01, 0x73, 0x85, 0x1c, 0x34, 0x7f, + 0x05, 0xf5, 0x0d, 0x13, 0x6f, 0xee, 0x7c, 0xe4, 0x7e, 0xbd, + 0xae, 0xf4, 0xc5, 0x12, 0xa4, 0xa2, 0xc6, 0x82, 0x3f, 0x4f, + 0x9c, 0xe4, 0xd8, 0xfa, 0x83, 0xa5, 0xaa, 0xf3, 0x56, 0x6e, + 0xe1, 0x36, 0x4e, 0x44, 0x8e, 0x94, 0xbf, 0x39, 0xb2, 0xfb, + 0x43, 0xc4, 0x28, 0xa1, 0x79, 0xad, 0x14, 0x94, 0xc9, 0xd9, + 0x32, 0x2d, 0x67, 0xc1, 0x39, 0x8e, 0x1e, 0xea, 0x0a, 0xf6, + 0x52, 0x72, 0x50, 0x2e, 0xea, 0xc5, 0x26, 0x8a, 0xd0, 0x1f, + 0x58, 0xb6, 0xb0, 0x63, 0x9f, 0x80, 0xf1, 0x75, 0xfc, 0xb0, + 0x0b, 0xf3, 0xfb, 0xf4, 0x3c, 0xc1, 0xa7, 0xf2, 0x97, 0xe8, + 0x7d, 0x73, 0x2e, 0xdd, 0x4f, 0x9e, 0xd3, 0x6c, 0x7a, 0x2c, + 0xe2, 0xc4, 0xb4, 0x72, 0xe6, 0x08, 0xb8, 0x3c, 0xfa, 0x10, + 0xbf, 0xd1, 0x77, 0x78, 0xfe, 0xa2, 0x9b, 0xfd, 0x3c, 0xf5, + 0xcd, 0x40, 0xa6, 0xe3, 0x16, 0xc9, 0x34, 0xb6, 0x17, 0xcb, + 0x4f, 0xde, 0xd2, 0x6c, 0x7a, 0x2c, 0xe2, 0xc4, 0xb4, 0x12, + 0xe6, 0x0b, 0xb8, 0x3c, 0xfa, 0x70, 0x9f, 0xd1, 0x75, 0x70, + 0xfe, 0xa2, 0x1b, 0xfd, 0x3c, 0xf4, 0xcd, 0x40, 0xa6, 0xe3, + 0x16, 0xc9, 0x3c, 0xb6, 0x1f, 0xcb, 0xb0, 0x21, 0x2d, 0x93, + 0x85, 0xd3, 0x1d, 0x3b, 0x4b, 0xed, 0x19, 0xf4, 0x47, 0xc3, + 0x05, 0x8f, 0x60, 0x2e, 0x8a, 0x8f, 0x01, 0x5d, 0xe4, 0x02, + 0xc3, 0x0b, 0x32, 0xbf, 0x59, 0x1c, 0xe9, 0x36, 0xc3, 0x49, + 0xe0, 0x34, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xe8, 0x14, 0x48, 0x7f, 0x8b, 0xeb, 0x9f, 0x7c, 0xca, 0xb0, + 0x52, 0x6e, 0x5f, 0x99, 0x14, 0xd0, 0xb5, 0x63, 0x02, 0x98, + 0x38, 0x27, 0xae, 0x2e, 0xe6, 0x53, 0xc4, 0x0b, 0x29, 0x25, + 0x51, 0xc9, 0x66, 0x59, 0x26, 0x57, 0x90, 0xd1, 0x8e, 0x40, + 0xf1, 0x46, 0x29, 0x9f, 0xce, 0xd6, 0xa1, 0xb5, 0x35, 0x3e, + 0xd6, 
0x11, 0x29, 0x73, 0xba, 0xeb, 0x85, 0x88, 0x07, 0x27, + 0x45, 0x47, 0xdd, 0xac, 0xcc, 0xe8, 0x0b, 0x7e, 0x94, 0x62, + 0xbd, 0x9a, 0x4f, 0x9c, 0xe8, 0x8f, 0xa1, 0x8a, 0x0d, 0x8c, + 0xbb, 0x30, 0x7a, 0x90, 0xcb, 0x26, 0x0f, 0xd8, 0x5b, 0x4f, + 0x2b, 0x3a, 0xb3, 0xee, 0x82, 0x4c, 0xe8, 0xd1, 0x04, 0xcd, + 0xa4, 0x34, 0xbd, 0x08, 0xd9, 0x4e, 0x8b, 0xf2, 0x3b, 0x17, + 0xb9, 0x7d, 0x9a, 0xc5, 0xa8, 0x8c, 0xd3, 0xf7, 0x47, 0x26, + 0xa0, 0xb2, 0x1b, 0xb9, 0x6b, 0x33, 0xbe, 0x76, 0x5a, 0x6d, + 0xdb, 0x31, 0x29, 0x79, 0xd5, 0x8c, 0x9b, 0x5c, 0xd1, 0xce, + 0xc0, 0x60, 0xee, 0x26, 0x8a, 0xd1, 0x76, 0x1e, 0x0e, 0xcf, + 0xa9, 0xe0, 0xc7, 0xf8, 0x36, 0x5f, 0x58, 0xaa, 0xd4, 0xcd, + 0x4c, 0x21, 0x60, 0xaf, 0xaf, 0x69, 0x16, 0x92, 0x3a, 0xf3, + 0x55, 0xf5, 0x5f, 0x39, 0xec, 0x41, 0x1f, 0x0b, 0xf4, 0xc3, + 0x4c, 0x17, 0x60, 0x1c, 0xb6, 0xcf, 0x09, 0x9e, 0x56, 0x9c, + 0xba, 0x86, 0xfa, 0xab, 0xd4, 0x9e, 0x5e, 0x9a, 0xa1, 0x97, + 0xaf, 0xa3, 0x24, 0xb5, 0x1a, 0xfb, 0x65, 0x9d, 0xe9, 0x2f, + 0xb8, 0x08, 0x8d, 0x08, 0x45, 0x43, 0x4c, 0x17, 0x60, 0x1c, + 0xb6, 0xcf, 0x89, 0x8e, 0x77, 0xd8, 0xba, 0x86, 0xfa, 0xaa, + 0xd6, 0x9e, 0x5e, 0x3a, 0xa0, 0x97, 0xaf, 0xa3, 0x14, 0xb5, + 0x1a, 0xfb, 0x45, 0x9d, 0xf9, 0x2f, 0xb8, 0x08, 0x8f, 0x0a, + 0xe5, 0x43, 0xb3, 0xe8, 0x9f, 0xe3, 0x49, 0x30, 0x76, 0x71, + 0x88, 0x27, 0x45, 0x79, 0x05, 0x55, 0x29, 0x61, 0xa1, 0xc5, + 0x5f, 0x68, 0x50, 0x5c, 0xeb, 0x4a, 0xe5, 0x04, 0xba, 0x62, + 0x06, 0xd0, 0x47, 0xf7, 0x70, 0xf5, 0x1a, 0xbc, 0xa2, 0x30, + 0x75, 0x19, 0x16, 0x9a, 0x6f, 0x93, 0x72, 0xeb, 0x7a, 0x9b, + 0x3c, 0xa4, 0xd0, 0xb3, 0x5f, 0x0f, 0x74, 0xa4, 0x89, 0xab, + 0xe4, 0x58, 0x80, 0x28, 0x1c, 0xb6, 0x1e, 0x71, 0xd8, 0x1f, + 0xac, 0xe1, 0x14, 0x0b, 0x7d, 0xce, 0xdd, 0x0d, 0xf3, 0x9e, + 0xb7, 0x29, 0x53, 0xc4, 0x92, 0x28, 0xfc, 0xb8, 0x43, 0x55, + 0x31, 0xee, 0x42, 0x18, 0xb9, 0x6d, 0xb2, 0xe0, 0x74, 0x7a, + 0x86, 0x52, 0xf7, 0x71, 0x2c, 0x21, 0x6f, 0x4e, 0xad, 0x80, + 0xf4, 0xa9, 0xc8, 0xd7, 0xbf, 0x8b, 0x95, 0x27, 0xfd, 0x2e, + 0x88, 0x84, 0xeb, 
0x0b, 0x45, 0xbb, 0xf0, 0x0c, 0x16, 0x00, + 0x89, 0x96, 0xa5, 0x39, 0xee, 0x7c, 0xbe, 0x7b, 0xbf, 0xec, + 0xfa, 0xe4, 0xa4, 0xde, 0x0d, 0x00, 0xf4, 0xa1, 0xc6, 0xb4, + 0x38, 0xc5, 0xa8, 0xd5, 0x26, 0x5c, 0xc5, 0x89, 0x78, 0x0c, + 0x64, 0xdb, 0x9b, 0x3b, 0x7f, 0x9d, 0x30, 0x9f, 0xe0, 0xa9, + 0x8a, 0xdb, 0xa3, 0x66, 0x5f, 0x47, 0x6e, 0x25, 0x6e, 0x07, + 0x1d, 0x3f, 0x0e, 0xae, 0x85, 0x10, 0x2c, 0x79, 0x84, 0xa7, + 0xe7, 0xc0, 0xf4, 0x04, 0xc0, 0xae, 0xc1, 0xe0, 0x5b, 0x20, + 0x64, 0x30, 0xe2, 0x3c, 0x85, 0x8b, 0xeb, 0x0f, 0x9d, 0xcc, + 0x60, 0xe8, 0xc0, 0x7e, 0xd0, 0x82, 0x2b, 0xd3, 0x8c, 0xa7, + 0xc4, 0x5d, 0x2e, 0xd1, 0xa4, 0xeb, 0x83, 0xa0, 0xa4, 0x17, + 0x89, 0x9f, 0xdd, 0x61, 0xcb, 0x11, 0x62, 0x31, 0xb2, 0x8d, + 0x90, 0x89, 0xb0, 0x8a, 0x88, 0xe8, 0xe3, 0xfb, 0xc5, 0x3e, + 0x48, 0x96, 0x2f, 0xd8, 0x8c, 0xa3, 0xc4, 0x5d, 0x2e, 0xd1, + 0xa4, 0xa3, 0xc3, 0x80, 0xa4, 0x05, 0xc8, 0x8f, 0xd5, 0x61, + 0xcb, 0x11, 0x66, 0x31, 0xb0, 0x8d, 0x90, 0x89, 0xb8, 0x0a, + 0x88, 0xc8, 0xe3, 0xfb, 0xc5, 0x3e, 0x48, 0xb6, 0x2f, 0xda, + 0x73, 0x5c, 0x3b, 0xa2, 0xd1, 0x2e, 0x5b, 0x5c, 0x3c, 0x7f, + 0x5b, 0xfa, 0x37, 0x70, 0x2a, 0x9e, 0x34, 0xee, 0x99, 0xce, + 0x4f, 0x72, 0x6f, 0x76, 0x47, 0xf5, 0x77, 0x37, 0x1c, 0x04, + 0x3a, 0xc1, 0xb7, 0x69, 0xd0, 0x25, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x6d, 0x7e, 0xe7, 0x0c, 0xf1, 0xbe, + 0xc7, 0x85, 0x6a, 0xb1, 0x59, 0xe4, 0xba, 0x42, 0x1d, 0xfb, + 0x49, 0xf7, 0xb7, 0x21, 0x84, 0x1b, 0x90, 0xd9, 0x66, 0xa9, + 0x39, 0x79, 0xab, 0x89, 0xfb, 0x16, 0x68, 0x18, 0xd1, 0x2a, + 0x23, 0xda, 0xf3, 0xb2, 0xb5, 0xae, 0x1c, 0xd5, 0xed, 0x91, + 0xf7, 0xe3, 0x67, 0x09, 0x0a, 0x11, 0xf0, 0xf1, 0x92, 0x9d, + 0xde, 0xc4, 0x9a, 0x32, 0xa4, 0xb5, 0xa2, 0xa6, 0x68, 0x73, + 0xd3, 0xe3, 0x12, 0x0c, 0x93, 0x8f, 0x6f, 0x4f, 0x5b, 0xac, + 0xd0, 0x65, 0xb7, 0x72, 0xc7, 0x23, 0xb4, 0xa8, 0x10, 0x30, + 0x58, 0x32, 0xad, 0x3c, 0x7d, 0x86, 0xd0, 0xbe, 0xc6, 0x1d, + 0x3f, 0x04, 0xdb, 0xbc, 0x56, 0x74, 0x20, 0x4f, 0xa4, 0xc6, + 0x9e, 0x7a, 0xc5, 0xea, 0x03, 0xa1, 0x21, 0x5e, 0x2e, 0xe9, + 0xd3, 0xad, 0xc2, 0xd2, 0xf4, 0xd2, 0xe7, 0x33, 0xe3, 0x1c, + 0xb5, 0xbc, 0x26, 0x4b, 0x0c, 0x8f, 0x3d, 0xbd, 0x6c, 0x21, + 0xc2, 0x98, 0xc7, 0x5d, 0x65, 0x7a, 0x46, 0x71, 0x7b, 0xe3, + 0xa5, 0xe4, 0xd2, 0x2d, 0x10, 0xe7, 0x29, 
0x36, 0x44, 0x2e, + 0x56, 0x79, 0x52, 0xad, 0xb4, 0x60, 0x7f, 0xb6, 0x5a, 0x48, + 0x03, 0x09, 0x98, 0x74, 0x73, 0x7c, 0xab, 0xc1, 0x09, 0x21, + 0x21, 0x7c, 0x48, 0x6c, 0x81, 0x42, 0x89, 0x94, 0xf4, 0x7d, + 0x1e, 0xdf, 0x8b, 0xd3, 0x80, 0xbb, 0x79, 0x96, 0xd9, 0x15, + 0x38, 0x41, 0x2f, 0x21, 0xc8, 0x09, 0x04, 0x88, 0xb8, 0x24, + 0x20, 0x78, 0x92, 0x65, 0x7d, 0x75, 0x1f, 0xfb, 0xfa, 0x6d, + 0x81, 0xe2, 0xa9, 0xd4, 0xf4, 0x7d, 0x1e, 0xdf, 0x8b, 0xf7, + 0x84, 0xbb, 0x79, 0xd6, 0xdb, 0x15, 0xa8, 0x60, 0x2f, 0x30, + 0x5a, 0x09, 0x0c, 0x88, 0xb8, 0x24, 0x28, 0x78, 0x9a, 0x65, + 0x6d, 0x75, 0x0d, 0xff, 0xfa, 0x6d, 0x7e, 0x1d, 0x56, 0x2b, + 0x0b, 0x82, 0xe1, 0x20, 0x74, 0x08, 0x7b, 0x44, 0x86, 0x29, + 0x24, 0xea, 0x57, 0x9f, 0xd0, 0xcf, 0xa5, 0xf6, 0xf3, 0x77, + 0x47, 0xdb, 0xd7, 0x87, 0x65, 0x9a, 0x92, 0x8a, 0xf2, 0x00, + 0x05, 0x92, 0x19, 0x2a, 0x53, 0x46, 0x71, 0x4d, 0x03, 0xf2, + 0x9c, 0x90, 0x95, 0x6a, 0x26, 0x2b, 0x66, 0xae, 0x2d, 0x89, + 0x0e, 0x1e, 0x95, 0x73, 0x1c, 0xbe, 0x89, 0xa0, 0x3d, 0x24, + 0x04, 0x0c, 0x06, 0xc0, 0x6e, 0x21, 0x92, 0x1c, 0xd4, 0x81, + 0x43, 0xd5, 0xbe, 0xdb, 0xda, 0x11, 0xf5, 0x7d, 0xaf, 0x66, + 0xe2, 0xd7, 0x14, 0xc6, 0x1a, 0x47, 0x1c, 0xe0, 0x12, 0x4b, + 0xb9, 0x3b, 0xe6, 0xe5, 0x6b, 0x3b, 0x63, 0x11, 0x9a, 0x48, + 0xd3, 0xe7, 0xd4, 0xb5, 0xd9, 0xdf, 0x5c, 0xb3, 0x8e, 0x17, + 0x04, 0xbc, 0x54, 0x55, 0x02, 0xb8, 0x28, 0x6f, 0x54, 0x4b, + 0x9b, 0x67, 0x6a, 0x98, 0xe7, 0xb3, 0x1c, 0x2f, 0xf5, 0x5a, + 0x10, 0x18, 0x48, 0x77, 0x67, 0x9f, 0xbe, 0x88, 0xdc, 0xa3, + 0xb4, 0xcb, 0x64, 0x20, 0x98, 0xfb, 0x64, 0x78, 0x11, 0xaa, + 0x31, 0x7a, 0x3c, 0x2b, 0x25, 0x9a, 0x15, 0xc6, 0xc8, 0x69, + 0x5e, 0xc3, 0x9b, 0x4c, 0xe4, 0xec, 0x58, 0xfa, 0x08, 0x71, + 0xa1, 0x79, 0x8f, 0xeb, 0xd8, 0x41, 0xf2, 0x1f, 0xe0, 0x32, + 0x0a, 0x08, 0x1c, 0x90, 0x7f, 0xf7, 0x6c, 0xf5, 0x31, 0x4d, + 0x55, 0x5c, 0xc0, 0x9c, 0xdb, 0x94, 0x14, 0xac, 0x32, 0xd2, + 0xbf, 0xf8, 0xb4, 0x29, 0x4a, 0xec, 0x1e, 0xc4, 0x3a, 0xfc, + 0x34, 0x74, 0x36, 0x0e, 0xe0, 0xb1, 0x3a, 0x09, 0x82, 
0xfa, + 0x5d, 0x6d, 0x33, 0x99, 0x65, 0x0f, 0x79, 0x0e, 0xf2, 0x27, + 0x03, 0xa7, 0x14, 0xa0, 0x18, 0xe3, 0x8d, 0xe4, 0xb0, 0x2b, + 0x68, 0x99, 0x09, 0xa6, 0xdb, 0xd2, 0x12, 0x3a, 0xb6, 0x0f, + 0xe0, 0xb1, 0x1a, 0x09, 0x80, 0xfa, 0x5d, 0xed, 0x31, 0x99, + 0x65, 0x4f, 0x7d, 0x0e, 0xf1, 0x27, 0x03, 0xb5, 0x14, 0xa0, + 0x18, 0xe3, 0x8d, 0xe4, 0xb0, 0x2b, 0x68, 0x99, 0x0b, 0xa4, + 0xdb, 0xd8, 0x12, 0x3a, 0x49, 0xf0, 0x1f, 0x4e, 0xe5, 0xf6, + 0x7f, 0x05, 0xa2, 0x12, 0xce, 0x66, 0x9a, 0xb0, 0x82, 0xf1, + 0x0e, 0xd8, 0xfc, 0x4a, 0xeb, 0x5f, 0xe7, 0x1c, 0x72, 0x1b, + 0x4f, 0xd4, 0x97, 0x66, 0xf4, 0x5b, 0x24, 0x27, 0xed, 0xc5, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xd2, 0xc2, + 0x63, 0x6b, 0x40, 0x0d, 0x57, 0xed, 0x9b, 0xb6, 0x8f, 0x44, + 0xcb, 0x86, 0x60, 0x0a, 0xc3, 0xcd, 0xc2, 0x66, 0x23, 0xbf, + 0xa1, 0x17, 0x66, 0x3e, 0xb9, 0xb6, 0x91, 0xc0, 0x53, 0x8a, + 0xf6, 0x70, 0x08, 0x76, 0xb0, 0x07, 0x46, 0xe9, 0xec, 0x5d, + 0x6e, 0x02, 0x4e, 0x89, 0x69, 0x25, 0x9c, 0x53, 0xd7, 0x41, + 0x04, 0x51, 0x55, 0xfa, 0x4b, 0x8e, 0x37, 0x44, 0xbf, 0xe5, + 0x14, 0x86, 0xfd, 0x64, 0x5b, 0x61, 0x5c, 0x65, 0x06, 0x48, + 0x3d, 0x26, 0x86, 0x06, 0x34, 0x34, 0x61, 0x70, 0x7b, 0x09, + 0x1b, 0xb1, 0xf2, 0xaa, 0xd8, 0x0d, 0xa1, 0x23, 0x73, 0xdb, + 0x28, 0x32, 0xac, 0x3d, 0x81, 0x5e, 0x1f, 0x9d, 0xa0, 0x98, + 0xb3, 0xf9, 0xf3, 0xba, 0xbd, 0xff, 0x74, 0x50, 0x3a, 0x55, + 0x6d, 0xae, 0x0e, 0xbc, 0xc8, 0x92, 0x1f, 0x02, 0x78, 0xe2, + 0xd3, 0xbe, 0xcf, 0xc7, 0xa2, 0x19, 0x11, 0x2c, 0x47, 0x97, + 0x28, 0xc8, 0x38, 0xc6, 0xd8, 0x1e, 0xb2, 0x7e, 0xdf, 0x70, + 0xde, 0x20, 0x62, 0x29, 0x96, 0x83, 0x8c, 0x32, 0x7f, 0x45, + 0x18, 0x6f, 0xff, 0xae, 0x49, 0xb8, 0x2d, 0x0d, 0xe1, 0xd3, + 0xaf, 0x66, 0x7e, 0x79, 0x59, 0x0e, 0xa3, 0xfb, 0x07, 0x13, + 0x6d, 0x8c, 0xa0, 0x6f, 0x7a, 0x4c, 0x81, 0x4d, 0x24, 0x41, + 0xff, 0x02, 0x49, 0x1e, 0x6d, 0x4e, 0x8b, 0x17, 0xf6, 0xac, + 0x40, 0x8b, 0xfd, 0x2d, 0x04, 0xf0, 0x07, 0x49, 0x01, 0x78, + 0x53, 0x09, 0x09, 0xeb, 0x1d, 0x13, 0xa9, 0xc9, 0xa0, 0x3c, + 0x56, 0xfc, 0x25, 0x4f, 0x24, 0x41, 0xbe, 0x03, 0x4d, 0x1e, + 0x6d, 0x4c, 0x89, 0x17, 0xf6, 0xad, 0x40, 0x8b, 0xfd, 0x2d, + 0x05, 0xf0, 0x27, 0x69, 0x01, 0x78, 0x53, 0x09, 0x09, 0xeb, + 0x1d, 0x13, 0xe9, 0xcc, 0xa0, 0x3c, 0x5e, 0xfc, 0xa1, 0x4f, + 0xdb, 0xbe, 0x41, 
0xfc, 0xb2, 0xe1, 0x92, 0xb3, 0x76, 0xe8, + 0x09, 0x52, 0xbf, 0x74, 0x02, 0xd2, 0xfa, 0x0f, 0xd8, 0x96, + 0xfe, 0x87, 0xac, 0xf6, 0xf6, 0x14, 0xe2, 0xec, 0x16, 0x33, + 0x5f, 0xc3, 0xa1, 0x03, 0x5e, 0xb0, 0x92, 0xd5, 0x27, 0x71, + 0x63, 0x8b, 0x06, 0xbe, 0x52, 0xdb, 0x4b, 0x28, 0xb4, 0x83, + 0xee, 0x62, 0xd4, 0xdc, 0x85, 0xfc, 0x1f, 0xcb, 0xb5, 0x3a, + 0x39, 0x42, 0x2e, 0xa4, 0x6a, 0x78, 0xe5, 0x10, 0xe6, 0x6b, + 0xe2, 0x7f, 0x03, 0xcf, 0xbd, 0x9f, 0x0c, 0x9a, 0x6e, 0x07, + 0x4b, 0x42, 0xb8, 0x5f, 0x12, 0x6e, 0x90, 0x2e, 0xf4, 0xce, + 0x22, 0xbe, 0x18, 0x05, 0x85, 0x0d, 0xb4, 0xe7, 0x53, 0xbb, + 0x9b, 0x4f, 0x52, 0x29, 0xbf, 0x72, 0x14, 0x24, 0x28, 0x36, + 0x20, 0xe6, 0x9f, 0x5a, 0xb2, 0xa4, 0xdf, 0x4a, 0xf7, 0xf9, + 0xb8, 0xd7, 0x0c, 0x42, 0x99, 0xad, 0x58, 0xae, 0x70, 0x06, + 0x97, 0xde, 0x39, 0x3d, 0x20, 0xca, 0xbc, 0x36, 0xd4, 0x74, + 0x81, 0xa6, 0x1c, 0x47, 0x85, 0x65, 0xde, 0xb5, 0xe1, 0x92, + 0x8c, 0x4f, 0x67, 0x39, 0x61, 0x83, 0xff, 0x36, 0x25, 0xee, + 0x15, 0x45, 0x78, 0x52, 0xca, 0xdc, 0x9a, 0xee, 0x10, 0xda, + 0xd9, 0x09, 0x1a, 0xc6, 0x7a, 0xfb, 0x57, 0xc1, 0x95, 0x5c, + 0x1b, 0x35, 0xc6, 0x95, 0xd3, 0x37, 0xed, 0xe6, 0xd9, 0xf9, + 0x4f, 0x9a, 0x7e, 0x9d, 0x0f, 0x2c, 0x6b, 0xde, 0x8c, 0x39, + 0x6d, 0x7e, 0x74, 0xe1, 0x4b, 0x7b, 0xaf, 0xbd, 0x08, 0xa9, + 0x9d, 0x1b, 0x85, 0x60, 0x19, 0xac, 0x54, 0x55, 0x43, 0x1d, + 0x53, 0x76, 0xf2, 0xae, 0xbd, 0x59, 0x66, 0x92, 0x99, 0x66, + 0x52, 0xe8, 0x55, 0x9f, 0x2d, 0x2c, 0x0b, 0xbf, 0xa4, 0x60, + 0x56, 0xf9, 0x6f, 0x6f, 0x40, 0xa4, 0xb5, 0x5b, 0xb3, 0xc1, + 0x23, 0x85, 0x54, 0x51, 0x42, 0x15, 0x53, 0x36, 0xfe, 0xae, + 0xbf, 0x59, 0x6f, 0x92, 0x9f, 0x74, 0x46, 0xa8, 0x55, 0x9f, + 0x2d, 0x2c, 0x0b, 0x7f, 0xb4, 0xe0, 0x12, 0xf9, 0x6f, 0x2f, + 0x48, 0xa4, 0xb5, 0x5b, 0xb3, 0xc0, 0x13, 0x85, 0xab, 0xae, + 0xbd, 0xea, 0xac, 0xc9, 0x01, 0x51, 0x40, 0xa6, 0x90, 0x6d, + 0x60, 0x8b, 0xb9, 0x57, 0xaa, 0x60, 0xd2, 0xd3, 0xf4, 0x80, + 0x4b, 0x1f, 0xed, 0x06, 0x90, 0xd0, 0xb7, 0x5b, 0x4a, 0xa4, + 0x4c, 0x3f, 0xec, 0x7a, 0x03, 
0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x62, 0x0f, 0xc7, 0x45, 0x3e, 0x48, 0xc7, 0x20, + 0xaf, 0x96, 0x33, 0x26, 0xd7, 0xfc, 0x52, 0x2f, 0x0d, 0x11, + 0xc7, 0x22, 0x77, 0xb8, 0x15, 0x45, 0x6c, 0x66, 0x0e, 0x3b, + 0x26, 0x6e, 0xd1, 0x3c, 0x09, 0xa2, 0x73, 
0xc4, 0x59, 0x82, + 0xe4, 0xb3, 0xae, 0x9f, 0x02, 0xd3, 0x05, 0xb2, 0xef, 0xdd, + 0x24, 0x38, 0xc0, 0x21, 0xf4, 0x84, 0xd7, 0x8b, 0xa3, 0xcb, + 0x3a, 0x55, 0x86, 0xa8, 0x18, 0xab, 0xc0, 0x22, 0xcb, 0x11, + 0x10, 0x5b, 0x61, 0x5b, 0xfa, 0x8b, 0x12, 0x0c, 0x82, 0x01, + 0xab, 0x48, 0x78, 0x53, 0x88, 0x71, 0x6a, 0x84, 0x11, 0x29, + 0xc5, 0x19, 0x70, 0x48, 0xae, 0x6b, 0x30, 0xcc, 0xfe, 0xd9, + 0xe1, 0xf2, 0x02, 0xab, 0x28, 0x15, 0x87, 0x30, 0xb7, 0xba, + 0xa7, 0x80, 0xb4, 0x52, 0x08, 0x9f, 0xf6, 0xf1, 0xbc, 0x9a, + 0xfc, 0x33, 0xac, 0xa1, 0x35, 0x92, 0xb9, 0x72, 0xb2, 0x5a, + 0xab, 0x89, 0x5c, 0xfc, 0x39, 0xac, 0x88, 0xac, 0x87, 0xb6, + 0xbf, 0x55, 0x95, 0xa9, 0x01, 0xcf, 0x76, 0xc3, 0x6b, 0x8e, + 0xbd, 0x4b, 0x6f, 0x31, 0x57, 0xd9, 0x0a, 0x25, 0x48, 0xfd, + 0xe6, 0x88, 0x8b, 0xc1, 0xfd, 0xd2, 0xfd, 0xc4, 0x49, 0x8c, + 0xf3, 0x84, 0xfb, 0xfe, 0x46, 0x8f, 0xf6, 0xea, 0x0d, 0xc3, + 0x60, 0xd4, 0xb7, 0xc7, 0x6b, 0x4e, 0xc0, 0x03, 0xd2, 0xa3, + 0x9d, 0xaa, 0x1a, 0x26, 0x2e, 0xdd, 0xbd, 0x08, 0xab, 0xdc, + 0xf5, 0x56, 0xba, 0xc8, 0xda, 0xcc, 0xf3, 0xad, 0xfd, 0xed, + 0xa6, 0xfe, 0xf1, 0x38, 0x4f, 0x64, 0x20, 0xd2, 0xb7, 0xc3, + 0x6b, 0x4e, 0xd4, 0x03, 0xf2, 0x21, 0x9d, 0x8a, 0x1a, 0x27, + 0x2e, 0xdd, 0x2d, 0x08, 0xab, 0xd8, 0xf5, 0x56, 0xba, 0xc8, + 0x58, 0xcc, 0xf3, 0xac, 0xfd, 0xee, 0xa6, 0xbe, 0xf1, 0x38, + 0x4d, 0xe9, 0x20, 0xd2, 0x48, 0x3c, 0x94, 0xb1, 0x2b, 0xfc, + 0x0d, 0xde, 0x62, 0x75, 0xe5, 0xd8, 0xd1, 0x22, 0xd2, 0xf7, + 0x54, 0x27, 0x0a, 0xa9, 0x45, 0x37, 0xa7, 0x33, 0x0c, 0x53, + 0x02, 0x11, 0x59, 0x41, 0x0e, 0xc7, 0xb2, 0x16, 0xdf, 0x2d, + 0x69, 0xe7, 0x0a, 0x57, 0x1c, 0x6d, 0xd2, 0x4f, 0xd9, 0xaf, + 0x9e, 0x36, 0xf2, 0x12, 0x55, 0x57, 0x3b, 0x05, 0x04, 0x1c, + 0xad, 0xc6, 0x81, 0xd1, 0x14, 0x65, 0x28, 0x4c, 0x1a, 0x9d, + 0x7a, 0x13, 0x56, 0x12, 0xd3, 0x30, 0x15, 0x82, 0xcf, 0x21, + 0x4f, 0x8e, 0x2c, 0x6e, 0x02, 0x3f, 0x0a, 0x1e, 0xd3, 0x08, + 0x47, 0x51, 0x1a, 0x30, 0x88, 0xcd, 0xba, 0xbb, 0x8e, 0xde, + 0x2c, 0xbf, 0x02, 0xd6, 0xa8, 0xcc, 0x17, 0xe8, 0xc7, 
0xb0, + 0xf9, 0x06, 0x8e, 0xe5, 0xcc, 0xfb, 0x7b, 0x02, 0xb7, 0x5a, + 0x7a, 0x81, 0x57, 0x40, 0x74, 0x6a, 0xe1, 0x60, 0xf5, 0x94, + 0x90, 0x32, 0x2b, 0xba, 0x6f, 0x67, 0x05, 0xa6, 0xe9, 0xc5, + 0x12, 0xb1, 0x62, 0xa7, 0xe5, 0xdb, 0xa9, 0xb0, 0x79, 0xf4, + 0x93, 0x4b, 0x38, 0xd4, 0x59, 0x72, 0xba, 0x97, 0x67, 0x9c, + 0x47, 0x65, 0xe2, 0x06, 0x19, 0x4c, 0x22, 0xfc, 0x66, 0x4e, + 0x4e, 0x17, 0xb1, 0x64, 0x3c, 0x99, 0x7d, 0xce, 0xd9, 0x80, + 0xb1, 0xde, 0xb0, 0x70, 0xb6, 0x1a, 0x36, 0x91, 0x7c, 0x88, + 0xe4, 0x49, 0x18, 0x7d, 0x0f, 0xfc, 0x56, 0xb3, 0x25, 0xb1, + 0x12, 0x00, 0xf3, 0x77, 0x7a, 0xe7, 0xe4, 0xd2, 0xd8, 0x99, + 0xdb, 0x21, 0x91, 0x0a, 0x0e, 0xef, 0xb2, 0x44, 0x61, 0x6d, + 0x26, 0xb2, 0x3a, 0xca, 0x28, 0x50, 0x84, 0x6d, 0x0b, 0xb8, + 0x47, 0x58, 0x19, 0x86, 0xf3, 0x65, 0x12, 0x96, 0x55, 0xef, + 0xe2, 0xd3, 0x8c, 0xe0, 0xe1, 0x53, 0xbd, 0x00, 0x93, 0xc3, + 0x4a, 0x6f, 0xb2, 0xd8, 0x80, 0x6d, 0x26, 0xb2, 0x32, 0xdb, + 0x28, 0xd0, 0x84, 0x69, 0x1b, 0x39, 0x47, 0xdc, 0x14, 0xa6, + 0xf3, 0x65, 0x12, 0x14, 0x55, 0xff, 0x62, 0xf3, 0x8c, 0xc2, + 0xe1, 0x51, 0xbd, 0x00, 0x93, 0x8b, 0x0a, 0x6f, 0xb2, 0xd8, + 0x00, 0x6d, 0xd9, 0x4d, 0xcd, 0x24, 0xd7, 0x2f, 0x7b, 0x96, + 0xe4, 0xc6, 0xb8, 0x23, 0xeb, 0x59, 0x0c, 0x9a, 0xed, 0xeb, + 0xaa, 0x00, 0x9d, 0x0c, 0x73, 0x3d, 0x1e, 0xae, 0x42, 0xff, + 0x6c, 0x74, 0xf5, 0x90, 0x4d, 0x27, 0xff, 0x92, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 
0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xff, 0x2a, 0x32, 0x76, + 0x13, 0x21, 0x2b, 0x93, 0x80, 0xbd, 0xb3, 0x32, 0x28, 0xa0, + 0x66, 0x6d, 0x12, 0x27, 0xce, 0xb5, 0x07, 0x1c, 0x6c, 0x5d, + 0x87, 0x15, 0x2c, 0xf5, 0x4d, 0x69, 0xd3, 0x9c, 0x4b, 0xb8, + 0x54, 0xa5, 0x1b, 0xcc, 0x35, 0x97, 0x07, 0xb8, 0x7e, 0xb9, + 0xca, 0xa6, 0xb8, 0x31, 0x84, 0x7f, 0x18, 0x5c, 0x4a, 0x62, + 0xfe, 0x8b, 0xe3, 0x8d, 0x7b, 0x9d, 0x97, 0x62, 0xc3, 0x53, + 0xf0, 0x8d, 0x8d, 0x98, 0x37, 0x96, 0xd2, 0xea, 0x5e, 0x2d, + 0xe9, 0xb8, 0x10, 0xc2, 0x40, 0x1b, 0xac, 0x2e, 0xb5, 0x88, + 0x7c, 0x5a, 0x9a, 0x06, 0xcf, 0x6b, 0x91, 0xd1, 0x9f, 0xc5, + 0x3c, 0x62, 0x6c, 0x84, 0xc2, 0x59, 0xe9, 0xad, 0x2d, 0x31, + 0xf3, 0x38, 0x36, 0xaf, 0xd4, 0x11, 0x37, 0x09, 0xee, 0x89, + 0x0e, 0x57, 0x00, 0xca, 0xb4, 0x98, 0x1d, 0x94, 0xad, 0x8c, + 0x62, 0x8b, 0x53, 0x40, 0xf5, 0x27, 0x59, 0x82, 0x58, 0x63, + 0xc2, 0x63, 0x56, 
0x6f, 0x16, 0x89, 0x63, 0xd6, 0x21, 0xbc, + 0xe7, 0xf4, 0xef, 0x86, 0x04, 0xff, 0xb2, 0xcf, 0x97, 0x03, + 0xa3, 0xc1, 0xc9, 0xb3, 0x93, 0xeb, 0xc9, 0x49, 0x38, 0x12, + 0xb9, 0x73, 0x5a, 0x7c, 0xf9, 0x7a, 0xc8, 0x05, 0x30, 0x64, + 0x3d, 0xf8, 0x2f, 0xc2, 0x85, 0xe2, 0xe4, 0xb0, 0x66, 0x8b, + 0xb4, 0xda, 0x6f, 0xde, 0x98, 0x63, 0xab, 0x89, 0x9b, 0x3a, + 0x16, 0xda, 0x79, 0x0f, 0x59, 0x12, 0x03, 0x57, 0xd1, 0x94, + 0x1a, 0x7a, 0xc2, 0x61, 0x20, 0xe5, 0xb1, 0x9e, 0x26, 0x9e, + 0x76, 0xe8, 0xe4, 0xb0, 0x67, 0x8b, 0x94, 0x9a, 0x6e, 0xde, + 0x98, 0x23, 0xab, 0x81, 0x99, 0xb2, 0x16, 0xda, 0x79, 0x0f, + 0x79, 0x12, 0x03, 0x57, 0xd9, 0x94, 0x5a, 0x7a, 0xca, 0x61, + 0x30, 0xe5, 0xb1, 0x9c, 0x26, 0x9e, 0x75, 0xe8, 0x1b, 0x4f, + 0x98, 0x74, 0x6b, 0x65, 0x91, 0x21, 0x67, 0xdc, 0x54, 0x7e, + 0x66, 0x4d, 0xe9, 0x25, 0x86, 0xf0, 0x86, 0xed, 0xfc, 0xa8, + 0x26, 0x6b, 0xa5, 0x85, 0x35, 0x9e, 0xcf, 0x1a, 0x4e, 0x63, + 0xd9, 0x61, 0x8a, 0x17, 0x86, 0x89, 0x67, 0x5a, 0x21, 0x61, + 0xee, 0xf3, 0xd1, 0x93, 0x30, 0x01, 0x34, 0xeb, 0xd1, 0x76, + 0x6b, 0x92, 0x6f, 0x67, 0x13, 0xdd, 0x5f, 0x33, 0x15, 0xda, + 0x42, 0x4c, 0x5b, 0xae, 0x0f, 0xa7, 0xb2, 0x5e, 0x1c, 0x71, + 0xfa, 0x4c, 0xe9, 0x5f, 0x35, 0xcb, 0x64, 0xe4, 0x2b, 0x3d, + 0xc8, 0xc1, 0xc5, 0xa1, 0x6a, 0x32, 0xd7, 0x48, 0x9f, 0xe3, + 0xd6, 0xba, 0xdc, 0x7d, 0x8d, 0x3f, 0x82, 0x6f, 0xbf, 0x42, + 0x57, 0xa1, 0x10, 0xf7, 0x98, 0x3e, 0x37, 0x3c, 0xcb, 0x03, + 0x9d, 0x83, 0x3f, 0xe3, 0xae, 0x92, 0x50, 0x49, 0xc6, 0x01, + 0x40, 0xd2, 0xbc, 0xb7, 0xb5, 0xce, 0xac, 0x3e, 0x79, 0x2b, + 0xe4, 0x3d, 0x16, 0xec, 0x3f, 0x69, 0xa9, 0xe9, 0x3f, 0xd7, + 0x9e, 0x79, 0xa7, 0xe7, 0x12, 0x01, 0xae, 0x3b, 0x70, 0x5b, + 0x98, 0x13, 0x53, 0xe6, 0x92, 0x69, 0xcc, 0xe3, 0x4c, 0xe4, + 0x01, 0x47, 0x75, 0x90, 0x2a, 0x69, 0xef, 0x5a, 0xfa, 0xa4, + 0x3d, 0x3b, 0x0c, 0x2f, 0xa9, 0xdf, 0xdc, 0x8f, 0x9a, 0x8f, + 0x2c, 0x9b, 0x7a, 0x1d, 0x91, 0xd0, 0x63, 0x1a, 0xfe, 0xce, + 0xf9, 0x9b, 0xf7, 0x05, 0xf6, 0xef, 0x82, 0xc3, 0xe1, 0x19, + 0x10, 0x16, 0x13, 0xe8, 0x8e, 
0x71, 0x88, 0xfc, 0xe6, 0x00, + 0x18, 0xe2, 0xdf, 0xec, 0x8e, 0xed, 0x4f, 0x03, 0xff, 0x8f, + 0x90, 0xfb, 0x2e, 0x22, 0xc5, 0x04, 0x9d, 0x38, 0xb6, 0x66, + 0xb6, 0x37, 0x91, 0xd5, 0x51, 0xf3, 0x38, 0x22, 0xae, 0x60, + 0x92, 0xa0, 0xc8, 0x5c, 0x0e, 0x25, 0x3a, 0x52, 0x54, 0x5c, + 0x8e, 0xed, 0x0e, 0x03, 0xfe, 0x8f, 0x90, 0xda, 0x2b, 0x32, + 0xc5, 0x04, 0x9d, 0x39, 0xb6, 0xe6, 0xb6, 0x37, 0x91, 0xd5, + 0x51, 0xf9, 0x38, 0x22, 0xae, 0x60, 0x9a, 0xa1, 0xc8, 0x5c, + 0x0e, 0x25, 0x3a, 0x52, 0xd4, 0x6c, 0x71, 0x12, 0xf1, 0xfc, + 0x01, 0x70, 0x6f, 0x25, 0xd4, 0xcd, 0x3a, 0xfb, 0x62, 0xc6, + 0x49, 0x19, 0x49, 0xc8, 0x6e, 0x2a, 0xae, 0x06, 0xc7, 0xdd, + 0x51, 0x9f, 0x65, 0x5e, 0x37, 0xa3, 0xf1, 0xda, 0xc5, 0xad, + 0x2b, 0x93, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x74, 0x73, 0xb9, 0x06, 0xb5, 0x76, 0x7e, 0x6b, 0x7f, 0xb0, + 0xef, 0x45, 0xc3, 0x06, 0x30, 0x8c, 0x41, 0x38, 0xa6, 0x8a, + 0xec, 0xb8, 0x23, 0xef, 0xa0, 0xec, 0x44, 0x5f, 0x61, 0xbc, + 0xa9, 0xe5, 0x94, 0x5a, 0xba, 0x8f, 0x1e, 0xc6, 0xaf, 0x7f, + 0x39, 0xc0, 0xc2, 0x07, 0xd9, 0x84, 0x3f, 0x77, 0x43, 0xb5, + 0xbc, 0xce, 0x03, 0x25, 0x76, 0x4b, 0x98, 0xd0, 0x0e, 0x53, + 0x79, 0x78, 0x81, 0x7c, 0xcc, 0x99, 0x7f, 0x1d, 0x5c, 0x13, + 0x7a, 0xda, 0x43, 0xd3, 0xda, 0x1a, 0x2f, 0xe8, 0x6b, 0x3d, + 0x21, 0xde, 0xf5, 0x92, 0xcc, 0xe1, 0x12, 0xfd, 0xd9, 0x40, + 0xd2, 0x25, 0xee, 0x85, 0x06, 0x13, 0xd7, 0x64, 0xfb, 0xda, + 0xdd, 0xba, 0xb3, 0xfc, 0x7f, 0x17, 0x2f, 0xe9, 0x26, 0xf2, + 0x20, 0x78, 0xcb, 0xfd, 0x6b, 0x01, 0x18, 0x93, 0x79, 0xf4, + 0xb1, 0x6b, 0x4f, 0xdd, 0x06, 0x0f, 0x84, 0x55, 0x6e, 0xa4, + 0xe9, 0x2e, 0x15, 0xe5, 0x36, 0xf6, 0x99, 0x9d, 0x68, 0x0e, + 0xd0, 0xf1, 0xd1, 0xaa, 0x2b, 0x99, 0x89, 0x25, 0x2b, 0xa6, + 0xcc, 0xa8, 0x1d, 0x91, 0x3d, 0x5f, 0x2c, 0xae, 0x22, 0xfc, + 0x1d, 0x03, 0xd2, 0x34, 0xc3, 0x60, 0xa6, 0x8b, 0x25, 0x8b, + 0x21, 0x0b, 0xc1, 0x85, 0x00, 0xf7, 0x7a, 0xaa, 0x60, 0x66, + 0x3a, 0x7a, 0x3c, 0x39, 0xeb, 0x66, 0x5f, 0x39, 0x5f, 0x47, + 0x1f, 0x5d, 0xfb, 0xbf, 0x34, 0xfc, 0x18, 0x0a, 0xd6, 0x3c, + 0xa2, 0xf4, 0x08, 0x1b, 0x5e, 0x48, 0x61, 0x72, 0xa1, 0xb3, + 0x08, 0x55, 0x7a, 0xfa, 0x72, 0xf0, 0x3a, 0xfa, 0x30, 0x3d, + 0xab, 0x26, 0x5f, 0x39, 0x5f, 0x07, 0x1d, 0x5d, 0xed, 0xbf, + 0x34, 0xfc, 0x18, 0x0a, 0xd6, 0x3c, 0xa6, 0xe4, 0x08, 
0x0b, + 0x54, 0x48, 0x61, 0x72, 0xa1, 0xb3, 0x08, 0x55, 0x7a, 0xfa, + 0x72, 0xf0, 0xc5, 0x05, 0xcf, 0xc2, 0x54, 0xd9, 0xa0, 0xc6, + 0xa0, 0xf8, 0xe2, 0xa2, 0x12, 0x40, 0xcb, 0x03, 0xe7, 0xf5, + 0x29, 0xc3, 0x59, 0x1b, 0xf7, 0xf4, 0xab, 0xb7, 0x9e, 0x8d, + 0x5e, 0x4c, 0xf7, 0xaa, 0x85, 0x05, 0x8d, 0x0f, 0x14, 0x0b, + 0x10, 0x78, 0x94, 0x13, 0x6f, 0xe0, 0xe2, 0x54, 0xe6, 0x89, + 0xdb, 0x06, 0x47, 0xb6, 0x59, 0x03, 0x86, 0xb5, 0xf0, 0x63, + 0xc8, 0xc1, 0x57, 0xb8, 0x70, 0xd7, 0x43, 0xda, 0xe3, 0x1c, + 0x61, 0xb1, 0xd3, 0xeb, 0xa3, 0x85, 0x5c, 0x88, 0xe0, 0x7e, + 0xca, 0x55, 0x6d, 0x13, 0x9f, 0x04, 0xb5, 0xd5, 0xc3, 0x30, + 0xb0, 0x5e, 0xe4, 0x3e, 0xaf, 0xbc, 0x50, 0xa2, 0x70, 0xcd, + 0x93, 0x3a, 0x19, 0x1e, 0x2b, 0x26, 0xcb, 0x83, 0xcb, 0xb2, + 0x74, 0xc6, 0xa9, 0x1b, 0xca, 0x13, 0x1a, 0x6e, 0x9c, 0x46, + 0x4c, 0x1d, 0x6a, 0xd3, 0x06, 0x83, 0xb6, 0x19, 0x60, 0xc2, + 0x2e, 0xbb, 0x70, 0x4c, 0xd2, 0x4b, 0x34, 0x33, 0xb6, 0xb1, + 0x6a, 0x7b, 0xb5, 0x05, 0x68, 0x0d, 0xf1, 0x93, 0xd9, 0x71, + 0x53, 0x8d, 0x59, 0x87, 0x33, 0xe1, 0xab, 0xec, 0x2f, 0x7f, + 0x04, 0x48, 0x3d, 0xf5, 0xe4, 0x5b, 0x07, 0x84, 0xd3, 0x1e, + 0x77, 0xe8, 0x37, 0x18, 0xcb, 0xf8, 0x60, 0x19, 0xc7, 0x01, + 0x94, 0xc9, 0xa7, 0x22, 0x59, 0x4b, 0x91, 0x13, 0xcd, 0x88, + 0x7d, 0x84, 0xb1, 0x42, 0xa0, 0x21, 0xca, 0x78, 0x52, 0x3c, + 0x4c, 0x7f, 0x16, 0x57, 0xcf, 0x91, 0xe5, 0x7b, 0x52, 0x56, + 0x31, 0xa0, 0x7e, 0x16, 0x9b, 0xcb, 0xea, 0x57, 0xa5, 0x33, + 0x18, 0x28, 0xb5, 0x06, 0x09, 0xd5, 0x7d, 0x90, 0x9c, 0x13, + 0xa8, 0x7b, 0x9a, 0x5e, 0x1d, 0x6d, 0xc5, 0xce, 0x07, 0xd2, + 0x59, 0xfd, 0x65, 0xf9, 0x7b, 0x01, 0xed, 0x0f, 0x4d, 0x36, + 0xd3, 0xce, 0xf0, 0x01, 0xa5, 0x33, 0x58, 0x29, 0x95, 0x06, + 0x09, 0xdd, 0x7d, 0x90, 0x9d, 0x13, 0xa8, 0x31, 0x9a, 0x5c, + 0x1d, 0x2d, 0xc4, 0xcf, 0x07, 0xf2, 0x59, 0xdd, 0x65, 0xf9, + 0x73, 0x11, 0xe9, 0xa3, 0x6f, 0x16, 0xd3, 0xcb, 0xf0, 0x03, + 0x5a, 0xcc, 0xa7, 0xd6, 0x6a, 0xf9, 0xf6, 0x22, 0x82, 0x6f, + 0x62, 0xec, 0x57, 0xce, 0x65, 0xa3, 0xe2, 0xd2, 0x3b, 0x30, + 0xf8, 
0x0d, 0xa6, 0x22, 0x9a, 0x06, 0x8c, 0xee, 0x16, 0x5c, + 0x90, 0xe9, 0x2c, 0x34, 0x0f, 0xfc, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xfd, 0xac, 0x75, 0x65, 0xb1, 0x2c, + 0x0f, 0xdc, 0x3e, 0x6e, 0xce, 0xd6, 0x91, 0x02, 0x81, 0x8a, + 0x5f, 0xb3, 0xb4, 
0xae, 0x73, 0x6d, 0xe5, 0x3e, 0xf5, 0xb5, + 0xe2, 0xa1, 0x35, 0x8f, 0x0f, 0xdf, 0x10, 0x68, 0x8d, 0xc3, + 0x48, 0x7d, 0xae, 0x89, 0x4d, 0xc2, 0x61, 0x2c, 0xfe, 0x69, + 0x4d, 0x59, 0x3e, 0x62, 0xe5, 0x51, 0x72, 0x6d, 0xa1, 0xb9, + 0xa3, 0xe2, 0xed, 0x80, 0xfd, 0xff, 0xdd, 0xb2, 0x4a, 0x4f, + 0xff, 0xb6, 0xab, 0x9e, 0xac, 0x93, 0xf5, 0x8c, 0xf4, 0x69, + 0xa8, 0xdb, 0xed, 0x4e, 0x1f, 0x2f, 0xf3, 0x6c, 0x7e, 0xe5, + 0xd8, 0xa1, 0x81, 0x1c, 0x1a, 0x4e, 0xe3, 0xcf, 0xe0, 0x39, + 0xda, 0x5d, 0xd1, 0x74, 0x65, 0x3b, 0x21, 0x91, 0x7f, 0xd7, + 0x2d, 0x19, 0x84, 0x9c, 0x6d, 0x47, 0x80, 0x9a, 0x28, 0xa2, + 0x60, 0x7c, 0x32, 0x59, 0xb2, 0x18, 0x4d, 0x29, 0x51, 0x14, + 0x2b, 0xab, 0x1f, 0x6c, 0xe2, 0x5e, 0x0b, 0x3d, 0x01, 0x39, + 0x2d, 0xb7, 0x5d, 0xd9, 0xf5, 0xcb, 0x6e, 0x64, 0x6d, 0xed, + 0x32, 0xdd, 0x81, 0x08, 0x87, 0xcf, 0x5c, 0x49, 0xf3, 0xee, + 0xa5, 0x08, 0x13, 0xc6, 0x97, 0x37, 0x22, 0x54, 0x25, 0xc0, + 0x15, 0x42, 0x8a, 0x33, 0x3a, 0x0b, 0x3b, 0xda, 0xc6, 0xd1, + 0x0d, 0x6d, 0x80, 0x6a, 0x0c, 0xac, 0xaa, 0x2f, 0xd6, 0x90, + 0x25, 0xac, 0x5e, 0xf2, 0x73, 0xe2, 0x27, 0x98, 0x01, 0xe2, + 0xd7, 0x16, 0x22, 0x30, 0x23, 0xb5, 0x17, 0x42, 0x8c, 0x73, + 0x0f, 0xd8, 0x33, 0x86, 0xcc, 0x9d, 0x29, 0xa9, 0xfa, 0x7a, + 0x0c, 0xac, 0xaa, 0x2f, 0x96, 0x90, 0x25, 0x8e, 0x5e, 0x71, + 0x73, 0xe2, 0x27, 0x98, 0x01, 0xe2, 0xd7, 0x16, 0x22, 0x30, + 0x23, 0xb5, 0x17, 0x42, 0x8e, 0x73, 0x0f, 0xc8, 0x33, 0x86, + 0xcc, 0xdd, 0x29, 0xa9, 0xea, 0x7a, 0xf3, 0x53, 0x55, 0xd0, + 0x69, 0x6f, 0xda, 0x71, 0xa1, 0x8e, 0x8c, 0x1d, 0xd8, 0x67, + 0xfe, 0x1d, 0x28, 0xe9, 0xdd, 0xcf, 0xdc, 0x4a, 0xe8, 0xbd, + 0x71, 0x8c, 0xf0, 0x37, 0xcc, 0x79, 0x33, 0x22, 0xd6, 0x56, + 0x15, 0x85, 0x7b, 0xd1, 0xb0, 0x41, 0xac, 0x2b, 0xf2, 0x96, + 0x64, 0xca, 0x03, 0xf7, 0x91, 0xa3, 0x2e, 0x39, 0xe3, 0x29, + 0x10, 0x60, 0xa3, 0x96, 0x48, 0x67, 0x3c, 0xd1, 0x45, 0x98, + 0x9f, 0x62, 0x01, 0xa9, 0x63, 0xaa, 0x0c, 0xe4, 0x4c, 0x52, + 0xbc, 0xe9, 0xfe, 0x6c, 0x59, 0x0f, 0x9a, 0x31, 0x72, 0x3f, + 0xd6, 0xba, 0x1b, 0xd8, 0xff, 
0x2e, 0xec, 0xeb, 0x82, 0xcb, + 0x57, 0xa5, 0x89, 0x7a, 0xdb, 0x8c, 0x18, 0xff, 0x1e, 0xf8, + 0x47, 0x05, 0x9f, 0x48, 0x07, 0xe8, 0xac, 0x14, 0xcd, 0xcc, + 0xd8, 0xac, 0x21, 0xaf, 0x22, 0x81, 0x2b, 0x81, 0x8a, 0xb3, + 0x51, 0x14, 0xc3, 0x2d, 0xc9, 0xa9, 0x5b, 0xd2, 0xf5, 0x74, + 0x52, 0xeb, 0x4f, 0x99, 0x18, 0x17, 0x87, 0xfa, 0x16, 0x88, + 0xa3, 0xd5, 0x17, 0xe8, 0xd1, 0x52, 0x59, 0x10, 0xb3, 0xa8, + 0xa6, 0xdc, 0x3d, 0x55, 0xa3, 0x89, 0x35, 0x91, 0x3b, 0x3c, + 0x45, 0x32, 0xfe, 0x33, 0xcc, 0x6a, 0x1d, 0x0f, 0x24, 0x21, + 0x7b, 0xab, 0x59, 0x6b, 0x92, 0x38, 0x9b, 0x65, 0x82, 0x8f, + 0xfa, 0x91, 0x5e, 0x2c, 0xeb, 0xb0, 0x89, 0x2f, 0xd3, 0x1e, + 0x06, 0x52, 0x0b, 0xdf, 0x16, 0x66, 0x1f, 0xe2, 0x96, 0xcf, + 0x28, 0x29, 0x0e, 0xad, 0xe2, 0x88, 0x0c, 0x81, 0xa9, 0x69, + 0x8c, 0x0c, 0x8e, 0xf5, 0xae, 0xeb, 0x76, 0x80, 0xf8, 0x86, + 0xbe, 0xa8, 0x24, 0x2f, 0xa3, 0xff, 0x80, 0xea, 0x0d, 0xbf, + 0x9d, 0x6d, 0xdf, 0xa2, 0x73, 0x43, 0x09, 0x44, 0x3b, 0x66, + 0x64, 0x8f, 0x2c, 0xc4, 0x77, 0x69, 0xcc, 0x3d, 0x8f, 0xe5, + 0xae, 0xca, 0x76, 0x80, 0xd8, 0x84, 0xbf, 0xa8, 0xa1, 0x2f, + 0x83, 0xff, 0x82, 0xea, 0x0d, 0x9f, 0x9d, 0x6d, 0xcf, 0xa2, + 0xf3, 0x53, 0x09, 0x60, 0x3f, 0x64, 0x64, 0x8f, 0x2c, 0x84, + 0x7b, 0x69, 0x8c, 0x3c, 0x70, 0x1a, 0x51, 0x35, 0x89, 0x7f, + 0x27, 0x7b, 0x40, 0x57, 0x5e, 0xd0, 0x7c, 0x00, 0x7d, 0x15, + 0xf2, 0x60, 0x62, 0x92, 0x30, 0x5d, 0x0c, 0xac, 0xf6, 0x9f, + 0xc0, 0x9b, 0x9b, 0x70, 0xd3, 0x7b, 0x84, 0x96, 0x73, 0xc3, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 
0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x8f, + 0xda, 0xe7, 0x48, 0x5f, 0xd5, 0xda, 0x90, 0x1b, 0x02, 0x32, + 0x10, 0x49, 0xd0, 0x0d, 0xcd, 0xee, 0xd8, 0x87, 0xeb, 0xc0, + 0x98, 0x49, 0x3b, 0x22, 0x39, 0x13, 0x41, 0xb4, 0x1b, 0x84, + 0xdb, 0x75, 0xca, 0xf6, 0xd3, 0x69, 0xe2, 0x6d, 0x10, 0xb7, + 0xca, 0xf3, 0xc3, 0xbc, 0xfa, 0xfb, 0x64, 0x4b, 0x91, 0xc3, + 0x7a, 0xa0, 0xb8, 0xd3, 0x2a, 0x15, 0x5e, 0x5b, 0x48, 0x80, + 0x7f, 0xea, 0xee, 0x96, 0x31, 0x6a, 0x31, 0x60, 0x3d, 0x94, + 0xb3, 0x55, 0x82, 0x3b, 0x77, 0xe3, 0x19, 0xdb, 0x6a, 0x0e, + 0x55, 0x73, 0xba, 0x5f, 0xfb, 0xae, 0xdf, 0x52, 0x7e, 0xd6, + 0xf4, 0xb8, 0x3c, 0x81, 0x60, 0x08, 0xcb, 0xdc, 0x68, 0xb8, + 0xc2, 0xd2, 0x02, 0x67, 0x6f, 0x8a, 0x5f, 0xd1, 0x15, 0x42, + 0x02, 0xc3, 0x9e, 0x35, 0x66, 0xaa, 0x11, 0xe2, 0xfe, 
0xca, + 0x46, 0x55, 0x25, 0x3b, 0xfe, 0x9c, 0x2c, 0xa3, 0xd7, 0xe1, + 0x70, 0x77, 0xa3, 0xbb, 0x52, 0xdd, 0x9d, 0xb7, 0x32, 0x4f, + 0x36, 0xd2, 0x91, 0x13, 0x34, 0x9b, 0x06, 0x29, 0xb4, 0x81, + 0xa3, 0xab, 0x24, 0xfd, 0x95, 0x06, 0x7d, 0x25, 0x33, 0x11, + 0x17, 0x98, 0x3d, 0xea, 0xf5, 0xda, 0xe8, 0x40, 0xc8, 0xe7, + 0x3a, 0xe2, 0x1a, 0x26, 0xdf, 0xbe, 0xdb, 0x29, 0x13, 0x33, + 0x70, 0x17, 0x42, 0x2b, 0x70, 0x44, 0x0f, 0xae, 0x24, 0x67, + 0xf5, 0x24, 0x76, 0x6d, 0x06, 0xbb, 0x14, 0x90, 0x58, 0xbe, + 0x55, 0x08, 0xbd, 0x00, 0x8f, 0x89, 0x7f, 0xc6, 0x03, 0x6b, + 0xba, 0x98, 0x35, 0xea, 0x13, 0xb3, 0x30, 0x17, 0x42, 0x2b, + 0x70, 0x45, 0x07, 0xae, 0x24, 0x67, 0xf5, 0x04, 0x76, 0x65, + 0x06, 0x39, 0x16, 0x90, 0x7c, 0xba, 0x55, 0x48, 0xfd, 0x00, + 0x8e, 0xcb, 0x7f, 0xc6, 0x03, 0x63, 0xba, 0x9e, 0x37, 0xea, + 0xec, 0x4c, 0xcf, 0xe8, 0xbd, 0xd4, 0x8f, 0xba, 0xf8, 0x51, + 0xdb, 0x98, 0x0a, 0xfb, 0x89, 0x9a, 0xf9, 0xc6, 0xe9, 0x6f, + 0x83, 0x45, 0xaa, 0xb7, 0x02, 0xff, 0x71, 0x34, 0x80, 0x39, + 0xfc, 0x9c, 0x45, 0x61, 0xc8, 0x15, 0x11, 0x4b, 0xa5, 0xdf, + 0xa7, 0xd1, 0x37, 0xdb, 0x89, 0x22, 0x9d, 0x72, 0xb9, 0x15, + 0xc0, 0xa8, 0xe4, 0xb1, 0x1b, 0x5f, 0xf3, 0xf5, 0x01, 0xd1, + 0x3f, 0x1b, 0x5c, 0x29, 0x4b, 0x59, 0x55, 0x06, 0xfe, 0xf2, + 0xcf, 0x60, 0x8f, 0xf9, 0x49, 0x88, 0xea, 0x3d, 0xbc, 0x86, + 0xcc, 0x70, 0x0f, 0x3f, 0xd3, 0x98, 0x43, 0x32, 0x16, 0x9c, + 0x13, 0x85, 0x4c, 0xcf, 0xff, 0x2f, 0xe4, 0xd4, 0x65, 0x3f, + 0x1a, 0xa2, 0xd5, 0xdc, 0x79, 0x30, 0xb9, 0x89, 0xb0, 0x55, + 0x59, 0xdf, 0xbc, 0xd8, 0x86, 0x63, 0x79, 0x22, 0x4b, 0xa3, + 0x76, 0xf1, 0x96, 0xd9, 0x42, 0x11, 0x4f, 0x6e, 0xac, 0x60, + 0x8a, 0x0d, 0xe2, 0xd3, 0x96, 0xfc, 0xb9, 0xf6, 0x48, 0x51, + 0xb8, 0x5f, 0xe3, 0x86, 0x13, 0x6d, 0x3c, 0x92, 0xdd, 0xb3, + 0xea, 0x49, 0xf0, 0x7c, 0x33, 0x2b, 0x9f, 0x0b, 0x95, 0xc9, + 0x4b, 0xe7, 0x0f, 0x3c, 0x27, 0xff, 0xf5, 0xde, 0xc4, 0xfc, + 0x61, 0x1f, 0xb0, 0x53, 0x6c, 0x9b, 0x0a, 0x87, 0x05, 0xb1, + 0xf7, 0x82, 0x9f, 0x19, 0xec, 0x77, 0x1c, 0xbe, 0x9c, 0x98, + 0x23, 
0xeb, 0xbf, 0xf5, 0x2a, 0x8b, 0x44, 0x82, 0x82, 0xa2, + 0x3a, 0x4a, 0xf5, 0xd7, 0x7f, 0xe3, 0xa7, 0xab, 0xd7, 0xa5, + 0xa2, 0x25, 0x4e, 0x49, 0x0c, 0x4a, 0x1b, 0xdc, 0x9f, 0x19, + 0xdc, 0x75, 0x98, 0x98, 0x78, 0x54, 0xab, 0xeb, 0x8f, 0x9e, + 0x7a, 0xea, 0x66, 0xe7, 0xc7, 0xe8, 0x6f, 0xfe, 0x72, 0xdf, + 0xf3, 0x27, 0xa3, 0xeb, 0xee, 0xf6, 0x20, 0xb7, 0x50, 0xe7, + 0x16, 0x19, 0x13, 0xcc, 0x1f, 0x19, 0xdc, 0x35, 0x98, 0x98, + 0xf8, 0x54, 0xa3, 0xeb, 0x9e, 0x9e, 0x7a, 0xca, 0x46, 0xe7, + 0xc7, 0xa8, 0x6f, 0xfe, 0xf6, 0xdf, 0xf3, 0xe7, 0xa3, 0xeb, + 0xc6, 0xf6, 0x20, 0xb7, 0x50, 0x47, 0x16, 0x09, 0xec, 0x33, + 0xe0, 0xe6, 0x23, 0x8a, 0x67, 0x67, 0x07, 0xab, 0x5c, 0x14, + 0x61, 0x61, 0x85, 0x35, 0xb9, 0x18, 0x38, 0x57, 0x90, 0x01, + 0x09, 0x20, 0x0c, 0x18, 0x5c, 0x14, 0x39, 0x09, 0xdf, 0x48, + 0xaf, 0xb8, 0xe9, 0xf6, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 
0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x7b, 0x74, 0x95, 0x3b, 0xd3, 0x34, 0x8d, 0x2b, + 0x05, 0x8a, 0x6e, 0x6c, 0x05, 0x69, 0xbe, 0xc1, 0xe7, 0xf0, + 0x22, 0x2f, 0xaa, 0x5c, 0x4c, 0xb6, 0x21, 0x28, 0x43, 0x4c, + 0x31, 0x06, 0x88, 0xcb, 0x47, 0x6d, 0x2d, 0x26, 0x28, 0x37, + 0x81, 0xc1, 0xeb, 0x2a, 0x46, 0x8e, 0x76, 0x0d, 0xa4, 0x4a, + 0xd9, 0x0b, 0xe5, 0x82, 0x98, 0x35, 0xdd, 0xc8, 0x75, 0xec, + 0x8d, 0x35, 0xa4, 0xea, 0x73, 0x48, 0x7f, 0xdf, 0xaf, 0xf2, + 0xa7, 0x50, 0x60, 0x2b, 0x9b, 0x7c, 0xb8, 0xc5, 0xd4, 0x8b, + 0x7e, 0x7e, 0x64, 0x47, 0x8b, 0x99, 0xfa, 0x74, 0xff, 0x36, + 0xfa, 0x97, 0x5f, 0x24, 0x5d, 0x2b, 0x90, 0x72, 0x14, 0xac, + 0x3e, 0xc4, 0xf2, 0xda, 0x37, 0x9b, 0xc0, 0x38, 0xbf, 0xdf, + 0xbe, 0x30, 0xe9, 0x06, 0xd2, 0x8a, 0x30, 0x3c, 0xd7, 0x4f, + 0x99, 0xa3, 0xc5, 0x63, 0x15, 0x42, 0xff, 0x1a, 0x14, 0x6b, + 0xea, 0x5a, 0x1a, 0xeb, 0xa6, 0x8f, 0x8a, 0x5b, 0x21, 0x15, + 0x67, 0xe5, 0x73, 0x2e, 0xb0, 0xb3, 0x7f, 0x52, 0x99, 0xda, + 0x84, 0x93, 0xf3, 0x9b, 0x9b, 0xea, 0x30, 0x16, 0x9d, 0x47, + 0x58, 0xcd, 0x14, 0x52, 0x7e, 0x14, 0x32, 0x56, 0x6b, 0x89, + 0xb6, 0x32, 0x6b, 0xba, 0xc5, 0x56, 0x6c, 0xc5, 0xde, 0x2e, + 0x0f, 0xfd, 0x6d, 0x76, 0x85, 0xba, 0x90, 0x7b, 0xf2, 0x39, + 0xd6, 0x4b, 0x93, 0x34, 0xdf, 0x6f, 0x7e, 0xdd, 0xd2, 0x70, + 0x2e, 0x31, 0x6b, 0x4d, 0x6a, 0xc1, 0xbe, 0x27, 0x2b, 0x90, + 0xa7, 0x7e, 0x39, 0xbd, 0xcd, 0x3a, 0xaf, 0x9d, 0x7f, 0x56, + 0x89, 0xba, 0x90, 0x7a, 0xf2, 
0x39, 0xd6, 0x4b, 0x93, 0x34, + 0xdf, 0x67, 0x7e, 0xdd, 0x52, 0x50, 0x2e, 0x31, 0x6b, 0x4c, + 0x6a, 0xc1, 0xbe, 0x23, 0x2b, 0x90, 0xa7, 0x5e, 0x79, 0xb5, + 0xcd, 0x3a, 0xaf, 0xbd, 0x80, 0xa9, 0x76, 0x45, 0x6f, 0x85, + 0x0d, 0xc6, 0x29, 0xb4, 0x6c, 0xcb, 0x20, 0x98, 0x81, 0x22, + 0xad, 0xaf, 0xd1, 0xce, 0x94, 0xb3, 0x95, 0x3e, 0x41, 0xdc, + 0xd4, 0x6f, 0x58, 0xa1, 0x86, 0x4a, 0x32, 0xc5, 0x50, 0x42, + 0xf3, 0x49, 0x33, 0x45, 0x53, 0x14, 0xcf, 0x66, 0xde, 0xa9, + 0x44, 0xe2, 0x96, 0xff, 0xdc, 0xe2, 0x42, 0x25, 0x7e, 0x0f, + 0xb5, 0x4c, 0x16, 0xca, 0x47, 0x40, 0x04, 0x18, 0xf9, 0xb6, + 0x70, 0x56, 0x3c, 0x47, 0xa3, 0xe0, 0x6f, 0x27, 0x18, 0x56, + 0xe5, 0x6d, 0xea, 0x04, 0xfb, 0x04, 0x60, 0x9a, 0xcb, 0xa7, + 0x60, 0xda, 0xaf, 0xe4, 0xd2, 0x10, 0xca, 0xba, 0xd1, 0x4e, + 0x3d, 0xbb, 0x0b, 0xe6, 0x63, 0xce, 0x28, 0x4b, 0x49, 0xe2, + 0xfa, 0x19, 0xcd, 0xd1, 0xed, 0xc6, 0xa7, 0x73, 0x3f, 0xc4, + 0xe3, 0xf0, 0x54, 0xb7, 0xa8, 0x9d, 0xc4, 0x65, 0xf2, 0xcd, + 0x04, 0x49, 0xd7, 0x9b, 0xda, 0x9e, 0xb4, 0xeb, 0xa1, 0xe2, + 0xb5, 0x6a, 0x84, 0x0d, 0xb0, 0xc1, 0xd7, 0x9f, 0x2d, 0x08, + 0xdf, 0x69, 0x7f, 0xc0, 0xcb, 0xca, 0xf0, 0x0c, 0x33, 0x73, + 0x2f, 0xbe, 0xe6, 0xb9, 0x0d, 0xcc, 0x11, 0x9f, 0x86, 0xa6, + 0xbf, 0xa4, 0x68, 0xa4, 0x57, 0x55, 0x80, 0x2a, 0x26, 0x24, + 0x87, 0xf7, 0x55, 0x17, 0x9e, 0x41, 0x95, 0x09, 0xcc, 0x5a, + 0x17, 0xd0, 0x6f, 0xaa, 0xbb, 0x35, 0x22, 0x56, 0xb3, 0x8d, + 0xc9, 0xb6, 0xc5, 0x51, 0x5a, 0x3e, 0x15, 0x22, 0x56, 0x67, + 0x20, 0xf4, 0x09, 0x40, 0x8a, 0x65, 0x37, 0x60, 0xb6, 0x3c, + 0x27, 0x19, 0x55, 0x2d, 0xcc, 0x61, 0xcc, 0xd2, 0x62, 0x19, + 0x72, 0xf3, 0xa2, 0x9a, 0xeb, 0x22, 0x2d, 0x94, 0xd5, 0x19, + 0xd2, 0x7a, 0x55, 0x06, 0xae, 0xad, 0x75, 0xb4, 0xad, 0x60, + 0xb4, 0x52, 0xb5, 0x3d, 0x97, 0x96, 0x0f, 0x51, 0x55, 0x29, + 0xcc, 0x40, 0xcc, 0xd2, 0x62, 0x18, 0x72, 0xb3, 0xa2, 0x9e, + 0xe3, 0x29, 0x2d, 0x94, 0xd5, 0x59, 0xf2, 0x5a, 0x55, 0x06, + 0x2e, 0xa5, 0x75, 0xb4, 0xad, 0x60, 0xb4, 0x42, 0xb5, 0x65, + 0x97, 0x96, 0xf0, 0xae, 0xaa, 0xd6, 0x33, 
0xbf, 0x33, 0x2d, + 0x9d, 0xe7, 0x8d, 0x4c, 0x5d, 0x61, 0x1c, 0xd6, 0xd2, 0x6b, + 0x2a, 0xa6, 0x0d, 0xa5, 0xaa, 0xf9, 0xd1, 0x5a, 0x8a, 0x4b, + 0x52, 0x9f, 0x4b, 0xbd, 0x4a, 0x9a, 0x68, 0x69, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x73, 0x6c, 0xfd, 
0x2c, + 0x58, 0xf8, 0x2a, 0x03, 0x14, 0xff, 0x58, 0xaf, 0x67, 0x3c, + 0x45, 0xe6, 0x67, 0x4a, 0xfe, 0x67, 0x79, 0x72, 0x8a, 0x8d, + 0x19, 0xf7, 0x7b, 0xc9, 0x0c, 0xa3, 0x88, 0xc2, 0x4a, 0x4e, + 0x1b, 0xc7, 0x76, 0xbe, 0xf8, 0xf0, 0x34, 0x5b, 0xbf, 0x88, + 0xef, 0xf8, 0x28, 0xd7, 0x37, 0xf5, 0x15, 0x06, 0x53, 0x06, + 0x0a, 0xbd, 0xa0, 0x30, 0x6e, 0x9e, 0x07, 0x62, 0x64, 0x04, + 0x9f, 0x85, 0x11, 0x9c, 0xa3, 0xfa, 0x63, 0xa9, 0x30, 0xcc, + 0x6f, 0x54, 0x8f, 0x44, 0x2d, 0x16, 0xaa, 0xbe, 0xef, 0x6f, + 0x26, 0x6c, 0x59, 0x00, 0x26, 0x7b, 0x84, 0x38, 0x39, 0x55, + 0xd9, 0xfc, 0xe2, 0xa8, 0x4a, 0xeb, 0x7c, 0xa4, 0xa7, 0xd5, + 0x4e, 0x82, 0xe8, 0x99, 0xbe, 0xe3, 0x7f, 0x14, 0xb3, 0x7a, + 0x89, 0x88, 0x9e, 0x16, 0x7e, 0x22, 0x46, 0xd8, 0x2a, 0xe1, + 0x29, 0x0b, 0x6c, 0xbc, 0xf8, 0x45, 0xb6, 0x95, 0xfd, 0x17, + 0x9e, 0xe8, 0x18, 0xa8, 0x29, 0xa3, 0xb8, 0x96, 0x0b, 0xa9, + 0xca, 0xb2, 0x92, 0xb9, 0xf0, 0x51, 0x95, 0x8c, 0x48, 0x91, + 0x55, 0x81, 0x56, 0x16, 0x5c, 0x0d, 0xae, 0xbc, 0x11, 0x06, + 0xaf, 0x2f, 0x23, 0x9f, 0x83, 0x5f, 0xa0, 0xd0, 0x51, 0xf4, + 0x47, 0x01, 0x0f, 0x09, 0xc0, 0xb8, 0xfe, 0x8b, 0x5b, 0xb4, + 0x50, 0x73, 0x9d, 0x88, 0xd8, 0xb9, 0x5f, 0x42, 0x87, 0x56, + 0x1e, 0x41, 0x99, 0x09, 0x10, 0x12, 0xe4, 0x49, 0xe8, 0x9f, + 0xd7, 0xdc, 0xcd, 0x60, 0x5d, 0xe0, 0x2a, 0x09, 0x0b, 0x30, + 0xcc, 0xb9, 0xee, 0xab, 0x5b, 0xb4, 0x50, 0x73, 0x9d, 0x88, + 0xd8, 0xb9, 0x5f, 0x02, 0x87, 0x56, 0x1e, 0x41, 0x09, 0x09, + 0x14, 0x12, 0xa4, 0x4d, 0xe2, 0x9f, 0xd7, 0xdc, 0xcd, 0xe0, + 0x59, 0xe0, 0x2b, 0x09, 0x0b, 0x31, 0xcc, 0xb9, 0x11, 0x54, + 0xa4, 0x4b, 0xaf, 0x8c, 0x62, 0x77, 0x27, 0x46, 0xa0, 0xfd, + 0x78, 0xa9, 0xe1, 0xbe, 0xf6, 0xf6, 0xeb, 0xed, 0x5b, 0xb2, + 0x1d, 0x60, 0x28, 0x23, 0x32, 0x1f, 0xa6, 0x1f, 0xd4, 0xf6, + 0xf4, 0xce, 0x33, 0x46, 0x45, 0xf5, 0x65, 0xb7, 0x61, 0xa5, + 0x71, 0x0b, 0x11, 0x01, 0x73, 0xd9, 0xb1, 0x1d, 0xa0, 0xb5, + 0xbe, 0x21, 0x9e, 0x05, 0xf1, 0x20, 0x7b, 0x3d, 0x8c, 0x85, + 0x23, 0x3b, 0x5f, 0x98, 0xc6, 0x31, 0xc9, 0x21, 0x91, 0xd7, + 0xb7, 
0xf6, 0x87, 0xd0, 0x41, 0xf1, 0x28, 0xbe, 0xf5, 0x1c, + 0x1a, 0x70, 0x69, 0x81, 0x53, 0x69, 0xd8, 0xcf, 0xa9, 0x51, + 0x0d, 0x1d, 0xf8, 0x52, 0x94, 0x22, 0xf5, 0xf7, 0x1c, 0xe5, + 0xcb, 0x62, 0x37, 0x42, 0x1b, 0x9a, 0x61, 0xa2, 0xc8, 0xc5, + 0xe7, 0x62, 0xab, 0x14, 0x6a, 0x9f, 0xfc, 0x73, 0x2c, 0xa3, + 0x41, 0xe0, 0xee, 0x90, 0xed, 0xfd, 0xf0, 0xea, 0x06, 0x2d, + 0x37, 0x8e, 0x8d, 0xfb, 0xfc, 0xe1, 0x21, 0x90, 0x1c, 0x1a, + 0x26, 0x9f, 0x66, 0xde, 0xa0, 0x74, 0x3d, 0xb8, 0x26, 0x82, + 0x62, 0x2a, 0xc8, 0xa0, 0x04, 0x73, 0xe7, 0xd3, 0xa1, 0xfc, + 0x39, 0x58, 0x49, 0x1f, 0xb5, 0x7e, 0x41, 0xd6, 0x58, 0xb8, + 0x45, 0xed, 0x90, 0xea, 0x4d, 0xdf, 0x99, 0x47, 0x08, 0x10, + 0x00, 0xe4, 0x10, 0xfa, 0x67, 0x03, 0x5e, 0xef, 0xcd, 0xfe, + 0xfc, 0x1b, 0x61, 0xb6, 0x2d, 0x24, 0x6e, 0xfe, 0x7b, 0xfc, + 0x10, 0x46, 0xcd, 0x37, 0xc7, 0x02, 0xd8, 0x06, 0x7e, 0xf2, + 0xf1, 0x5c, 0xac, 0x91, 0xf0, 0x14, 0xc8, 0xc4, 0x93, 0x78, + 0x0e, 0x20, 0xa8, 0xbc, 0x3d, 0xee, 0xfc, 0x03, 0x4a, 0xbe, + 0x28, 0x18, 0xbd, 0xa0, 0xff, 0xfe, 0x59, 0x72, 0xcb, 0xb7, + 0x4f, 0x32, 0xf0, 0x34, 0x78, 0xf6, 0x78, 0x58, 0xb0, 0x97, + 0x60, 0x14, 0xc8, 0xc4, 0x13, 0x78, 0x2e, 0x22, 0x78, 0xac, + 0xbd, 0xee, 0xfc, 0x03, 0x4b, 0xbe, 0x29, 0x38, 0x3d, 0xe0, + 0x7f, 0xfe, 0x51, 0x72, 0xc3, 0xb7, 0x4f, 0x32, 0xf0, 0x34, + 0x78, 0xf6, 0x78, 0x58, 0xb0, 0x97, 0x9f, 0xeb, 0x37, 0x3b, + 0xec, 0x87, 0xd1, 0xdd, 0x87, 0x53, 0x42, 0x11, 0x03, 0xfc, + 0xb4, 0x41, 0xd6, 0xc7, 0xc2, 0x1f, 0x80, 0x01, 0xae, 0x8d, + 0x3c, 0x48, 0xb0, 0xcd, 0x0f, 0xcb, 0x87, 0x09, 0x87, 0xa7, + 0x4f, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 
0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x43, 0x48, 0x28, 0xff, 0x98, 0x33, 0x51, 0x26, 0x05, 0x65, + 0x55, 0x2f, 0x33, 0xf0, 0x8e, 0x61, 0xe5, 0xc1, 0x91, 0x02, + 0xb2, 0x8e, 0x82, 0x4b, 0xbc, 0xc7, 0xe5, 0x5e, 0x96, 0x25, + 0xe2, 0xeb, 0x5f, 0x79, 0x0f, 0x18, 0x52, 0x8c, 0x3d, 0x42, + 0xf7, 0x97, 0xa9, 0xd8, 0xf0, 0xcd, 0xaf, 0xe5, 0xa7, 0x3a, + 0x8b, 0x93, 0x26, 0x96, 0xc4, 0xb3, 0xd5, 0x2e, 0xbf, 0x9d, + 0x9b, 0xd6, 0x12, 0x60, 0xc9, 0x22, 0x3d, 0x93, 0xe3, 0xa0, + 0x3c, 0x8d, 0x78, 0xf3, 0x99, 0x27, 0x9f, 0xf3, 0xaf, 0xad, + 0x35, 0x4f, 0xd6, 0x5b, 0xf7, 0x78, 0xd5, 0xf5, 0x95, 0x0c, + 0xd5, 0x35, 0xbf, 0x67, 0x0b, 0xc2, 0x20, 0x5f, 0x6d, 0x87, + 0x2f, 0xbf, 0xf7, 0x8e, 0xe4, 
0x70, 0x62, 0x2d, 0x1c, 0x45, + 0xbc, 0x04, 0x2d, 0x3f, 0x19, 0x6f, 0x14, 0xf3, 0xb6, 0x72, + 0x7a, 0x5e, 0x99, 0x48, 0x0e, 0xc3, 0x07, 0x2f, 0xcf, 0x91, + 0xc7, 0x21, 0x80, 0x75, 0x00, 0x94, 0xc5, 0xea, 0x5f, 0x27, + 0x97, 0x81, 0xda, 0xa0, 0x13, 0x88, 0x68, 0x0c, 0x82, 0x92, + 0xf0, 0x44, 0xff, 0xfa, 0xef, 0x52, 0xe2, 0x69, 0xe5, 0x9a, + 0x4e, 0x85, 0x31, 0x71, 0xa3, 0xbf, 0xe6, 0x8b, 0xe9, 0x03, + 0x00, 0x99, 0xd9, 0x8b, 0x74, 0xb1, 0x55, 0x8a, 0x75, 0x8f, + 0xb2, 0x8a, 0x70, 0x0c, 0x85, 0x32, 0xda, 0x53, 0xbd, 0xbb, + 0xf0, 0xfa, 0xd0, 0x79, 0xe9, 0x68, 0x56, 0xa5, 0x07, 0x75, + 0x92, 0xbe, 0xa8, 0x99, 0xbb, 0x40, 0x8e, 0x81, 0xcd, 0xf8, + 0xdc, 0x77, 0x1a, 0x30, 0x96, 0xce, 0x12, 0x8a, 0x78, 0x0c, + 0x84, 0xb2, 0xf8, 0x53, 0xbd, 0xfb, 0xf4, 0x7a, 0xf2, 0x79, + 0xe9, 0x68, 0x5e, 0xa5, 0x07, 0x75, 0x83, 0xbf, 0xaa, 0x9b, + 0xbb, 0x43, 0x86, 0x81, 0xcd, 0xf9, 0xdc, 0x77, 0x12, 0x20, + 0x96, 0xcf, 0xed, 0x75, 0x87, 0xf3, 0x7b, 0x4d, 0x07, 0xac, + 0x42, 0x04, 0x0b, 0x85, 0x0d, 0x86, 0x16, 0x97, 0xa1, 0x5a, + 0xf8, 0x8a, 0x7c, 0x40, 0x55, 0x64, 0x44, 0xbc, 0x79, 0x7e, + 0x32, 0x06, 0x23, 0x88, 0xed, 0xdf, 0x69, 0x30, 0x1a, 0xe2, + 0xe7, 0xea, 0x99, 0x16, 0x58, 0x84, 0x8e, 0x2e, 0x60, 0xe4, + 0x63, 0x5b, 0x41, 0x45, 0x47, 0x94, 0x08, 0x23, 0x72, 0xfb, + 0x96, 0x58, 0x29, 0xb6, 0x7c, 0x1e, 0xb1, 0x3b, 0x9f, 0x1b, + 0x2b, 0x28, 0x45, 0x1b, 0xf9, 0x9d, 0xf9, 0xb3, 0xd4, 0x0f, + 0x54, 0x94, 0x9d, 0x8f, 0xc1, 0x5c, 0x7e, 0x71, 0xb4, 0xd0, + 0xb4, 0x60, 0x84, 0xa2, 0x9d, 0x33, 0x32, 0xa9, 0x24, 0x97, + 0x62, 0xd0, 0xc6, 0x38, 0xdb, 0xe9, 0x5d, 0x52, 0xc7, 0xd5, + 0x06, 0x63, 0x1d, 0x18, 0x20, 0x36, 0x30, 0x7b, 0xf1, 0x0c, + 0x7f, 0xad, 0x8d, 0x9a, 0xfa, 0xfa, 0x2e, 0x56, 0x0a, 0xbc, + 0xb2, 0xcf, 0x38, 0x23, 0x1d, 0xf8, 0xb9, 0x12, 0x6e, 0x3f, + 0x44, 0x81, 0xab, 0x59, 0x87, 0x12, 0xe7, 0x9e, 0xfc, 0x9a, + 0x19, 0x97, 0x7c, 0x53, 0xda, 0xe7, 0xd0, 0x32, 0xea, 0xb6, + 0x36, 0x36, 0x84, 0xad, 0x54, 0x76, 0x14, 0x8d, 0x79, 0x15, + 0x2f, 0xe3, 0xae, 0x36, 0xbb, 0xa1, 0x83, 
0xb7, 0xc6, 0xaf, + 0x81, 0x9d, 0x7b, 0x53, 0x52, 0xfc, 0x74, 0x8d, 0x2f, 0xb1, + 0x5c, 0x03, 0xa3, 0xe4, 0x1d, 0x06, 0x2d, 0x56, 0x98, 0x36, + 0x38, 0xfa, 0xe6, 0x96, 0x19, 0x7d, 0x27, 0x08, 0xae, 0x91, + 0xc3, 0x1c, 0xcd, 0x2e, 0x32, 0xa6, 0xfb, 0x77, 0x2f, 0x52, + 0xe2, 0xde, 0x13, 0x41, 0x61, 0x33, 0xed, 0xc3, 0xb3, 0x0c, + 0xcd, 0x02, 0x5c, 0xfe, 0x1c, 0x1d, 0xf7, 0x7a, 0x93, 0x8e, + 0xd1, 0x74, 0xa7, 0x58, 0x29, 0x94, 0x86, 0x15, 0x2f, 0x3d, + 0x62, 0x3e, 0xd3, 0x24, 0x2f, 0x52, 0x62, 0xde, 0x11, 0x41, + 0x61, 0x33, 0xdd, 0xc3, 0xb3, 0xac, 0xcd, 0x02, 0x5c, 0xf6, + 0x1c, 0x1d, 0xb0, 0x7a, 0x92, 0x8e, 0x19, 0x74, 0x27, 0x58, + 0xab, 0x94, 0x86, 0x15, 0xaf, 0xbd, 0x62, 0x3e, 0xc3, 0x34, + 0xd0, 0xad, 0x9d, 0x21, 0xee, 0xbe, 0x9e, 0xcc, 0x22, 0x3c, + 0x4c, 0x53, 0x32, 0xfd, 0xa3, 0x09, 0xe3, 0xe2, 0x4f, 0x85, + 0x6d, 0x71, 0xe6, 0x8b, 0xd8, 0xa7, 0x54, 0x6b, 0x79, 0xea, + 0x50, 0x42, 0x9d, 0xc1, 0x3c, 0xcb, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xed, 0x2a, 0x5a, 0x40, 0x7a, 0x0f, + 0xcb, 0x42, 0x80, 0x94, 0xe0, 0x5c, 0x86, 0x3c, 0x5a, 0x0e, + 0x52, 0x63, 0x2b, 0x66, 0xc5, 0x7d, 0x67, 0xb9, 0x1d, 0x88, + 0x03, 0x2f, 0x46, 0x8b, 0x8d, 0x5f, 0x3b, 0x2a, 0xe8, 0xed, + 0x25, 0x8e, 0xf1, 0x6d, 0x7e, 0xa1, 0x26, 0xa1, 0x09, 0x3a, + 0xc0, 0xff, 0x73, 0x3f, 0x61, 0x3d, 0x92, 0x64, 0x1e, 0x4a, + 0xec, 0x0a, 0x75, 0x9f, 0x0e, 0xe2, 0xd7, 0x39, 0x7b, 0x78, + 0xfc, 0xc0, 0x18, 0xf5, 0x57, 0x16, 0x4d, 0x46, 0x6a, 0x3d, + 0xa8, 0x04, 0xd8, 0xaa, 0x45, 0xaa, 0x10, 0xc1, 0xd7, 0x58, + 0x22, 0x79, 0x30, 0x02, 0xb1, 0x3f, 0x01, 0x07, 0xc0, 0x96, + 0x3e, 0x33, 0xd0, 0x27, 0xaf, 0x49, 0x7b, 0xca, 0x3f, 0x1c, + 0x26, 0x6a, 0x69, 0x50, 0xd0, 0x3e, 0x27, 0x0f, 0x6f, 0x50, + 0xd0, 0x79, 0x83, 0x86, 0xa4, 0xdd, 0xbc, 0x7d, 0x03, 0xac, + 0x4e, 0x7d, 0xb6, 0x8e, 0x65, 0x17, 0x43, 0x26, 0x71, 0x53, + 0x4e, 0xc7, 0x93, 0xea, 0xd2, 0xe8, 0x53, 0xa1, 0xfe, 0xc7, + 0x24, 0x66, 0x50, 0xcc, 0xa4, 0x80, 0x05, 0x2a, 0x50, 0x48, + 0xe4, 0xac, 0xf7, 0xa7, 0xc9, 0x0a, 0x55, 0xea, 0x61, 0x1c, + 0x9e, 0xce, 0x61, 0x5b, 0x74, 0xf9, 0x31, 0x4a, 0x94, 0xc3, + 0x58, 0x55, 0x8a, 0xcc, 0xe6, 0x14, 0x52, 0x45, 0xe2, 0xc9, + 0xff, 0x4a, 0x72, 0x7e, 0x93, 0xd8, 0x92, 0xed, 0x67, 0x57, + 0xe7, 0x12, 0x79, 0x62, 0x6f, 0x12, 0x5d, 0x07, 0x6e, 0x08, + 0x72, 
0x1b, 0xb2, 0xc9, 0xfd, 0x66, 0x84, 0x5d, 0x9d, 0xfc, + 0xee, 0x14, 0x52, 0x47, 0xe2, 0xcd, 0xef, 0x08, 0x12, 0x7e, + 0x93, 0xd8, 0x86, 0xed, 0x67, 0x57, 0xe7, 0x02, 0x7d, 0x6a, + 0x6f, 0x12, 0x5d, 0x07, 0x62, 0x0a, 0x72, 0x5b, 0xb3, 0xc9, + 0xdd, 0xe6, 0x84, 0x5d, 0x9d, 0xfc, 0x11, 0xeb, 0xad, 0xb8, + 0x1d, 0x32, 0x10, 0xf7, 0xed, 0x81, 0x6c, 0x27, 0x79, 0x12, + 0x98, 0xa8, 0x18, 0xfd, 0x82, 0x95, 0x90, 0xed, 0xa2, 0xf8, + 0x9d, 0xf5, 0x8d, 0xa4, 0x4c, 0x36, 0x22, 0x19, 0x7b, 0xa2, + 0x62, 0x03, 0x09, 0x8c, 0xf2, 0xb7, 0x04, 0x41, 0x19, 0xad, + 0x82, 0x1f, 0x29, 0x91, 0x1e, 0x0e, 0xe5, 0x78, 0xdd, 0x58, + 0x70, 0x01, 0x11, 0x06, 0x6d, 0xb0, 0x5f, 0x14, 0x3a, 0x22, + 0xd8, 0xb0, 0x0b, 0xcc, 0x82, 0x4d, 0x2b, 0x54, 0x19, 0xd5, + 0xd7, 0x6f, 0xc1, 0x0f, 0x01, 0x16, 0x15, 0x8f, 0x49, 0x5f, + 0xe6, 0x26, 0xd4, 0x98, 0xe0, 0x08, 0x48, 0x18, 0x36, 0x53, + 0x8d, 0x1a, 0x07, 0xb6, 0xe7, 0x07, 0x71, 0x81, 0x19, 0x82, + 0xf3, 0xe7, 0x1e, 0x49, 0xd3, 0x19, 0x1e, 0xac, 0x91, 0x7e, + 0x98, 0x24, 0x8e, 0xcc, 0x61, 0x80, 0xa4, 0x9d, 0xd5, 0x0f, + 0x1f, 0x53, 0xaa, 0xad, 0xce, 0x22, 0x8a, 0xf6, 0x0d, 0x93, + 0xcc, 0x21, 0xf2, 0x61, 0xd8, 0x4a, 0x17, 0x84, 0xcd, 0x19, + 0xd1, 0x3a, 0xd0, 0x3a, 0xd0, 0x45, 0x32, 0x43, 0x6b, 0x01, + 0x78, 0xf7, 0xa4, 0xe7, 0x05, 0x12, 0xd2, 0x2a, 0xd4, 0x80, + 0xc5, 0x77, 0x10, 0xbd, 0x15, 0xe7, 0x0c, 0x68, 0x78, 0xef, + 0x64, 0x65, 0x8e, 0x02, 0x02, 0xf0, 0xcb, 0x02, 0xf4, 0x82, + 0x77, 0xda, 0x4e, 0x89, 0x23, 0xd2, 0x1d, 0xbd, 0x60, 0xc1, + 0x71, 0x94, 0xef, 0xbc, 0x2f, 0x13, 0x2b, 0xb4, 0x22, 0xbd, + 0xa6, 0xe8, 0xa3, 0x8c, 0xb9, 0x1c, 0x34, 0x43, 0x8b, 0x11, + 0x3e, 0xdd, 0x0d, 0x03, 0xd0, 0x31, 0x76, 0xfe, 0x00, 0x8c, + 0xa2, 0x3a, 0x7c, 0xc5, 0x45, 0x6d, 0x84, 0x52, 0xf7, 0x34, + 0xaa, 0x22, 0x03, 0x16, 0x23, 0xef, 0x8d, 0xee, 0xc1, 0x8d, + 0x20, 0x6c, 0x14, 0x43, 0x9d, 0x5c, 0xb6, 0xed, 0x49, 0x02, + 0xd0, 0xb3, 0x76, 0xfe, 0x06, 0x8c, 0xa2, 0x3a, 0x7c, 0xc5, + 0x44, 0xed, 0xc4, 0x52, 0xf7, 0x34, 0xaa, 0x02, 0x03, 0x16, + 0x23, 0xaf, 0x8d, 
0xee, 0xc1, 0x8d, 0x30, 0x6c, 0x14, 0x43, + 0x8d, 0x5c, 0xb6, 0xdd, 0xb6, 0xfd, 0x2f, 0x4c, 0x89, 0x01, + 0xf9, 0x73, 0x5d, 0xc5, 0x83, 0x3a, 0xbb, 0x12, 0x3b, 0xad, + 0x08, 0xcb, 0x55, 0xfd, 0xfc, 0xe9, 0xdc, 0x50, 0x72, 0x11, + 0x3e, 0x72, 0xcf, 0x93, 0xeb, 0xbc, 0x72, 0xa3, 0x49, 0x22, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 
0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x13, 0x21, + 0xa0, 0x85, 0xd1, 0x3e, 0xe6, 0x79, 0x69, 0x7e, 0x19, 0x19, + 0x83, 0xd5, 0x6f, 0x58, 0xdc, 0xa4, 0x1e, 0x7f, 0x8a, 0x1e, + 0x02, 0x9e, 0x23, 0x7f, 0xb8, 0x41, 0xdd, 0x32, 0xa9, 0x88, + 0x97, 0xd3, 0x7a, 0xec, 0x32, 0x78, 0x6d, 0x9e, 0x17, 0x3a, + 0x8e, 0x75, 0x79, 0xe0, 0x1d, 0x86, 0x7d, 0x17, 0x8a, 0xbc, + 0xca, 0x8f, 0x2b, 0xcd, 0x8e, 0x6b, 0x44, 0xcb, 0x8f, 0x5b, + 0xf6, 0xe5, 0xf2, 0xee, 0xda, 0xc0, 0x24, 0x2c, 0x72, 0xe9, + 0xff, 0x6c, 0xb3, 0x82, 0x1b, 0x50, 0x0b, 0xee, 0xa9, 0x50, + 0xc4, 0x0e, 0x5f, 0x9b, 0x76, 0xfb, 0x22, 0xe2, 0x55, 0x39, + 0x6e, 0xb5, 0xcc, 0xa3, 0x80, 0x96, 0x6b, 0xac, 0x63, 0x96, + 0x43, 0x9c, 0xe5, 0x49, 0xa9, 0x2e, 0xf6, 0x58, 0x09, 0x70, + 0xed, 0xb2, 0x89, 0x48, 0xc7, 0xed, 0x74, 0xaf, 0x43, 0x37, + 0x79, 0x89, 0x73, 0xe1, 0x86, 0x47, 0x4b, 0x7b, 0xed, 0xd7, + 0x9e, 0xe9, 0xaa, 0x68, 0xc6, 0xaa, 0xde, 0xbe, 0xcd, 0x33, + 0xaf, 0xa3, 0x6f, 0x18, 0x35, 0x3c, 0x8f, 0x0a, 0x1d, 0x14, + 0x92, 0x9b, 0xbe, 0x12, 0x8e, 0xc3, 0x13, 0x00, 0x7c, 0xbd, + 0x60, 0x5e, 0x45, 0x7d, 0xcd, 0x16, 0x77, 0xa5, 0xfd, 0xe1, + 0x78, 0xec, 0xb9, 0x9a, 0x06, 0xc3, 0x27, 0xaa, 0x6b, 0x9a, + 0x64, 0x37, 0x49, 0x70, 0x94, 0x50, 0xfd, 0x99, 0x34, 0xf8, + 0x8b, 0x72, 0x73, 0xd4, 0x50, 0xd0, 0x28, 0x5d, 0x05, 0xb1, + 0xd3, 0x16, 0x97, 0xe5, 0xf9, 0xa1, 0x30, 0x5d, 0xcb, 0x88, + 0xe7, 0x40, 0x20, 0xe9, 0x6b, 0x98, 0x24, 0x34, 0x49, 0x72, + 0x9d, 0x50, 0xff, 0x99, 0x34, 0x78, 0x8b, 0x72, 0x53, 0xd0, + 0x50, 0xd1, 0x28, 0x5c, 0x05, 0x39, 0xc7, 0x16, 0xb7, 0xe5, + 0xf9, 0xe1, 0x78, 0x59, 0xcb, 0x88, 0xe7, 0x41, 0x22, 0xe9, + 0x94, 0x67, 0xdb, 0xcb, 0xb6, 0x8d, 0x62, 0xaf, 0x00, 0x66, + 0xcb, 0x87, 0x74, 0x8d, 0xac, 0x2f, 0xaf, 0x2e, 0xd7, 0xa3, + 0xfa, 0xc6, 0x38, 0xe9, 0x48, 0x1a, 0x06, 0x1e, 0x87, 0xa6, + 0x34, 0x77, 0x18, 0xbe, 0xdd, 0x16, 0xaa, 0xc0, 0x05, 0xc9, + 0x74, 0xd5, 0x2a, 0xb9, 0x7d, 0xea, 0xa1, 0xcc, 0xd2, 0x6e, + 0xb0, 0xd3, 0xb4, 0x15, 0x6f, 0x7f, 0x3c, 
0xba, 0x30, 0xe4, + 0x11, 0x5e, 0x15, 0xc4, 0x0d, 0x3e, 0x65, 0x16, 0x58, 0x71, + 0x37, 0xcb, 0x92, 0x21, 0x23, 0xbf, 0xef, 0x18, 0x5b, 0x32, + 0x38, 0xeb, 0x27, 0x2c, 0x72, 0xc7, 0xa0, 0x39, 0x3a, 0x91, + 0x47, 0x91, 0xf7, 0xb3, 0xd4, 0x3f, 0x2e, 0x54, 0xa1, 0x46, + 0xcd, 0x3b, 0x34, 0x6f, 0xd9, 0x7a, 0x64, 0xf0, 0xb9, 0x87, + 0x09, 0xd8, 0x3d, 0x01, 0xc9, 0x36, 0x67, 0x2b, 0x49, 0xe2, + 0xf5, 0xf9, 0xb0, 0xdc, 0xb8, 0x69, 0xda, 0x93, 0xe0, 0x3c, + 0xd0, 0xaa, 0xc2, 0x9b, 0xea, 0x7e, 0xb7, 0x64, 0x46, 0xae, + 0x1a, 0x62, 0x31, 0x33, 0xc7, 0x53, 0xbb, 0x0d, 0x8d, 0x89, + 0x59, 0xda, 0xb5, 0xba, 0x9b, 0xd9, 0xda, 0x99, 0x48, 0x96, + 0x00, 0x1d, 0x53, 0x48, 0x77, 0xe4, 0x58, 0x39, 0xad, 0xa9, + 0x43, 0xe6, 0x5f, 0xc7, 0x88, 0xd8, 0x67, 0x34, 0x00, 0x22, + 0x8a, 0x55, 0xeb, 0x31, 0x8d, 0xb9, 0x2d, 0x31, 0x5d, 0x6b, + 0xc2, 0x7a, 0x38, 0x1d, 0xfb, 0x98, 0x13, 0x67, 0xe2, 0x7d, + 0x78, 0xba, 0x16, 0xab, 0xfb, 0x6e, 0x5f, 0xb7, 0xf8, 0xd4, + 0x4f, 0x59, 0x1b, 0x16, 0xac, 0xbf, 0xcf, 0x59, 0xb0, 0x3b, + 0x3d, 0xd9, 0x43, 0x70, 0xd8, 0x2a, 0x8b, 0xf1, 0x5f, 0x1c, + 0xf4, 0xdd, 0x71, 0x09, 0xae, 0x6d, 0x81, 0x1e, 0xe7, 0x48, + 0xa9, 0xd2, 0x7d, 0xbe, 0x78, 0xa0, 0x5e, 0x99, 0x8b, 0x17, + 0x26, 0x00, 0x8b, 0x59, 0xb0, 0x3b, 0x3d, 0xd9, 0x43, 0x70, + 0xdc, 0x2a, 0x8b, 0xf1, 0x5b, 0x1d, 0xf0, 0xdc, 0x31, 0x09, + 0xae, 0x4d, 0xa1, 0x3a, 0xf6, 0xe8, 0xa9, 0xd2, 0x7d, 0xbe, + 0x78, 0xa4, 0x5e, 0x99, 0x0b, 0x16, 0x24, 0x20, 0x74, 0xa6, + 0x4f, 0xc4, 0xc2, 0x26, 0xbc, 0x8f, 0x23, 0xd5, 0x74, 0x0e, + 0xa4, 0xe2, 0x0f, 0x23, 0xce, 0xf6, 0x51, 0xb2, 0x5e, 0xc5, + 0x09, 0x17, 0x56, 0x2d, 0x82, 0x41, 0x87, 0x5b, 0xa1, 0x66, + 0xf4, 0xe9, 0xdb, 0xdf, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 
0x00, + 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x5c, 0x2d, 0xcc, 0xff, 0x03, 0xa7, 0x5b, 0x18, + 0x1b, 0x43, 0xa3, 0x52, 0x04, 0x96, 0x90, 0x8a, 0x25, 0x56, + 0xa2, 0x7d, 0x61, 0x0f, 0xc2, 0x83, 0x7d, 0x32, 0x9b, 0x3f, + 0x94, 0xb8, 0xdc, 0x89, 0x7c, 0x5a, 0x75, 0xcd, 0x10, 0xef, + 0x9b, 0xb9, 0xe6, 0x46, 0x09, 0x92, 0x7c, 0xa0, 0xe0, 0x5e, + 0xf0, 0x61, 0xab, 0xc8, 0xe2, 0x41, 0x04, 0x7e, 0x2b, 0x26, + 0x7a, 0x33, 0xa2, 0x33, 0xa8, 0x2d, 0xe5, 0x55, 0x5b, 0x1e, + 0xfe, 0x1b, 0xd2, 0xdd, 0xde, 0x3f, 0x98, 0xcd, 0x70, 0x1f, + 0x62, 0x03, 0x94, 0xd6, 0xe0, 0x30, 0xd3, 0xbd, 0x5d, 0xdd, + 0x25, 
0xc1, 0x98, 0x12, 0x39, 0xf6, 0x44, 0x2c, 0xdd, 0x42, + 0x89, 0x2f, 0x8d, 0xc9, 0x2b, 0x6f, 0x13, 0xf9, 0x9b, 0x65, + 0xdf, 0x3c, 0x9d, 0x50, 0xe5, 0x3f, 0x5e, 0x51, 0xa5, 0x4b, + 0x5e, 0x02, 0x35, 0xcc, 0x73, 0xf4, 0x91, 0x8f, 0x71, 0x8c, + 0xe0, 0x9c, 0xb6, 0xc4, 0xa9, 0xde, 0x37, 0x3c, 0x8c, 0x56, + 0x40, 0x9f, 0xfc, 0x07, 0x2a, 0xb6, 0x81, 0xf3, 0x2c, 0x02, + 0x7c, 0x1f, 0x60, 0xfe, 0x96, 0x7f, 0x2d, 0xa4, 0x55, 0xae, + 0xc6, 0xf1, 0xfb, 0x47, 0x22, 0x27, 0x0f, 0xf6, 0x40, 0xe5, + 0x37, 0x51, 0xaa, 0x27, 0x9a, 0xfd, 0xf4, 0x5b, 0x64, 0x6c, + 0x6c, 0x0e, 0x8b, 0x83, 0x9c, 0x43, 0xac, 0x0b, 0x7e, 0x71, + 0xb0, 0x79, 0x6a, 0xb3, 0x15, 0xfd, 0x4e, 0xe3, 0x10, 0x42, + 0xba, 0xae, 0xc1, 0xf6, 0x30, 0xc9, 0xf1, 0x3f, 0xaa, 0x16, + 0xcf, 0xb9, 0x90, 0x0b, 0x71, 0x31, 0x35, 0x06, 0x89, 0x83, + 0x9c, 0x43, 0x8c, 0x0f, 0x78, 0x79, 0xb0, 0x7b, 0x6a, 0xb2, + 0x15, 0xaf, 0x46, 0xf3, 0xd0, 0x43, 0x3a, 0xae, 0x41, 0xf6, + 0x30, 0xc1, 0xf1, 0x77, 0xaa, 0x36, 0xcf, 0xb9, 0x90, 0x1b, + 0x71, 0x31, 0x3d, 0x0e, 0x76, 0x7c, 0x63, 0xbc, 0x53, 0xf0, + 0x87, 0x86, 0x4f, 0x84, 0x95, 0x4d, 0xea, 0x50, 0xb9, 0x0c, + 0x2f, 0xbc, 0xc5, 0x51, 0xbe, 0x09, 0xcf, 0x3e, 0x0e, 0x88, + 0x55, 0xc9, 0x30, 0x46, 0x6f, 0xe4, 0x8e, 0xce, 0xc2, 0xf1, + 0x21, 0x92, 0xb3, 0x12, 0x7a, 0xad, 0x8c, 0x5f, 0x0c, 0x90, + 0x8f, 0x40, 0xb9, 0x15, 0xf8, 0x7a, 0x36, 0x2d, 0x59, 0xb5, + 0xfb, 0xee, 0xa9, 0x1c, 0xa1, 0x03, 0xea, 0xf6, 0x9d, 0xa4, + 0x79, 0x8a, 0xb3, 0x20, 0x50, 0x1c, 0x68, 0x85, 0x7d, 0x3d, + 0x84, 0x94, 0x21, 0x1b, 0x3a, 0x20, 0xf3, 0xcd, 0x71, 0x95, + 0x73, 0xa7, 0x78, 0x09, 0x52, 0x9a, 0x7f, 0x22, 0x1f, 0xbf, + 0x85, 0x14, 0xe2, 0x28, 0xf2, 0xf7, 0x67, 0x3a, 0x14, 0xc3, + 0x4f, 0xd0, 0x85, 0x79, 0x63, 0xdc, 0x80, 0x86, 0x53, 0x8f, + 0xf9, 0x2d, 0xc4, 0x85, 0x65, 0xdc, 0xe6, 0x95, 0x88, 0xc8, + 0x38, 0x98, 0xcc, 0x28, 0xfd, 0xac, 0x74, 0x14, 0x91, 0x96, + 0x30, 0x08, 0x26, 0xe8, 0x7f, 0xe8, 0xc6, 0x97, 0xb7, 0xa7, + 0x7e, 0x07, 0xca, 0x63, 0xb1, 0x71, 0x11, 0xb6, 0xe4, 0x27, + 0xa5, 0x5e, 0x69, 
0x25, 0x56, 0x72, 0xcd, 0x6b, 0xfc, 0xef, + 0xa0, 0x7a, 0x13, 0xb6, 0x9e, 0xdd, 0xf4, 0x68, 0xd3, 0x4c, + 0xec, 0x73, 0x03, 0x99, 0x3f, 0x05, 0x6c, 0xed, 0x71, 0xe2, + 0x8a, 0x69, 0x9f, 0x92, 0xe4, 0xa1, 0xbe, 0x2e, 0x8e, 0xc0, + 0xc8, 0x1f, 0xf9, 0xe0, 0x7f, 0x17, 0xe3, 0x15, 0x97, 0x28, + 0x53, 0x65, 0x60, 0x98, 0xd0, 0x32, 0xe2, 0x78, 0x78, 0x7b, + 0xfe, 0xcd, 0x66, 0xe5, 0xd8, 0x87, 0xc0, 0xf5, 0xd1, 0x52, + 0xe0, 0x26, 0x91, 0x1a, 0x21, 0xb5, 0x4a, 0x1a, 0x74, 0xaa, + 0x9c, 0x56, 0x03, 0x55, 0x17, 0x1c, 0x9b, 0xc6, 0x70, 0x58, + 0xd9, 0xc3, 0xac, 0x50, 0xe2, 0x3a, 0xbf, 0x8d, 0x6e, 0xe5, + 0xd8, 0xc7, 0x80, 0xe5, 0xd1, 0x52, 0xe0, 0x27, 0xb0, 0x1a, + 0x25, 0xa5, 0x08, 0x1a, 0x7c, 0xe8, 0xdc, 0x56, 0x23, 0x55, + 0x17, 0x1c, 0x9b, 0x46, 0x70, 0x58, 0xd9, 0xc3, 0xac, 0x70, + 0xf2, 0x3b, 0x40, 0x72, 0x91, 0x1a, 0x27, 0x38, 0x7f, 0x1a, + 0x2e, 0xad, 0x1f, 0xd8, 0x4f, 0xe5, 0xda, 0x5a, 0xf7, 0xe5, + 0x83, 0x17, 0x23, 0xa9, 0xdc, 0xaa, 0xe8, 0xe3, 0x64, 0xb9, + 0x8f, 0xa7, 0x26, 0x3c, 0x53, 0x8f, 0x0d, 0xc4, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xac, 0x1e, 0xa9, 0xe7, + 0x99, 0xc5, 0x06, 0x78, 0x7f, 0x8d, 0xe2, 0xef, 0xa4, 0x88, + 0xdb, 0x2a, 0x8e, 0xcd, 0xc2, 0x31, 0x58, 0xac, 0xa0, 0x96, + 0xb7, 0xd2, 0x12, 0x40, 0x46, 0x86, 0xcd, 0xc6, 0x51, 0x6d, + 0xd0, 0x52, 0x73, 0x0b, 0xff, 0x9f, 0x07, 0x0a, 0xaa, 0x6b, + 0x58, 0xbd, 0xba, 0xb2, 0x29, 0x62, 0x87, 0xaa, 0xa3, 0x5c, + 0x79, 0x7d, 0x73, 0x97, 0x50, 0x7c, 0x3a, 0x05, 0x67, 0x50, + 0x8d, 0x6a, 0xa1, 0xfc, 0xc0, 0x92, 0x85, 0x19, 0xbb, 0xac, + 0x3b, 0x85, 0x31, 0xe1, 0x53, 0x7c, 0x46, 0xdc, 0xca, 0xe3, + 0x77, 0x89, 0xcf, 0xa7, 0x31, 0x02, 0x5b, 0x50, 0xbd, 0x68, + 0x22, 0xc0, 0x3d, 0x0d, 0xd4, 0x76, 0x55, 0x5c, 0x9a, 0xd0, + 0x9d, 0x7a, 0x6b, 0xf7, 0x7f, 0x7e, 0xfd, 0xaf, 0xff, 0xd8, + 0xeb, 0x25, 0x26, 0xe8, 0xf6, 0x4b, 0xf4, 0x2c, 0x96, 0xf7, + 0xe1, 0xb9, 0xac, 0xe4, 0xaa, 0x17, 0x7a, 0x3f, 0x0e, 0x6a, + 0x2c, 0x3b, 0xfe, 0x37, 0xc6, 0xc5, 0xfc, 0x19, 0xe8, 0x76, + 0xf1, 0xf6, 0xbc, 0x8c, 0xee, 0x52, 0x7c, 0x09, 0xdb, 0xa4, + 0x04, 0x0f, 0xb5, 0x0b, 0x8d, 0x19, 0x0b, 0x0b, 0xd7, 0xa7, + 0xf0, 0x2e, 0x02, 0x0e, 0x1e, 0x4a, 0x47, 0x4d, 0x36, 0xc6, + 0x29, 0xbc, 0x9f, 0xd7, 0x16, 0x73, 0xd3, 0x34, 0x9c, 0xcd, + 0x7d, 0x53, 0xfb, 0x4b, 0x0f, 0x04, 0x0c, 
0x07, 0x73, 0x2b, + 0x95, 0x95, 0xa3, 0x0f, 0xde, 0x90, 0xf6, 0x3f, 0x32, 0x6e, + 0x17, 0x5f, 0x8c, 0xed, 0x38, 0x06, 0x39, 0x0d, 0x8d, 0x7c, + 0x85, 0xf1, 0xf3, 0x34, 0xbc, 0xcd, 0x7d, 0x52, 0xf9, 0x4b, + 0x0f, 0x84, 0x04, 0x07, 0x73, 0x2b, 0x95, 0x95, 0xa3, 0x0b, + 0xde, 0x90, 0xf6, 0x3f, 0x32, 0x6e, 0x1f, 0x4b, 0x8c, 0x6d, + 0x3c, 0x06, 0x2d, 0x0d, 0x9d, 0x7e, 0x85, 0x73, 0x0c, 0xcb, + 0x43, 0x32, 0x82, 0xad, 0x06, 0xb4, 0xf0, 0x7b, 0xfb, 0xf8, + 0x8c, 0xd4, 0x6a, 0x6a, 0x5c, 0xf4, 0x21, 0x6f, 0x09, 0xc0, + 0xcd, 0x91, 0xe0, 0xb4, 0x73, 0x92, 0xc3, 0xf9, 0xd2, 0xf2, + 0x62, 0x81, 0x7a, 0x8c, 0x6b, 0x8a, 0xee, 0xf0, 0x45, 0x3f, + 0xeb, 0xf5, 0x64, 0xb0, 0xd1, 0x54, 0x04, 0xe7, 0x16, 0x02, + 0x74, 0xa4, 0x8b, 0xbd, 0x35, 0xa4, 0x33, 0xa1, 0x00, 0x39, + 0x16, 0x16, 0x2a, 0x0f, 0xfa, 0xb6, 0x95, 0x11, 0x71, 0xb5, + 0x34, 0x1b, 0xbe, 0x28, 0x96, 0xa7, 0xe9, 0x64, 0x0c, 0x04, + 0xf2, 0xf4, 0xd6, 0x0e, 0xe7, 0xdd, 0x59, 0xef, 0x79, 0x29, + 0x7c, 0x68, 0x49, 0xf4, 0x66, 0xde, 0x44, 0x93, 0x33, 0xab, + 0xd1, 0x05, 0x83, 0x6e, 0xb0, 0x84, 0xc3, 0xa6, 0xc3, 0x84, + 0x5f, 0xdd, 0xee, 0x1a, 0x65, 0x57, 0xb9, 0xee, 0xfe, 0x82, + 0xf4, 0xd8, 0x8e, 0x8b, 0xbc, 0x84, 0x5e, 0xa1, 0x0c, 0x4f, + 0x40, 0x81, 0xda, 0x2d, 0xf0, 0xd3, 0x1a, 0x87, 0xff, 0x7e, + 0xd0, 0x9f, 0x39, 0xbc, 0x9f, 0x0f, 0x05, 0x06, 0x6e, 0x5b, + 0xc7, 0x4a, 0xfa, 0x89, 0xc3, 0xcf, 0x3d, 0xe2, 0x5f, 0x4f, + 0x80, 0x15, 0x93, 0xa5, 0x22, 0x12, 0x61, 0xae, 0x57, 0xe2, + 0x02, 0x40, 0x07, 0xd4, 0xb8, 0xf9, 0x33, 0x47, 0x1a, 0x42, + 0x4e, 0x84, 0xa2, 0xb0, 0x86, 0xf4, 0x07, 0x96, 0xf5, 0x37, + 0x84, 0x37, 0x2c, 0xa6, 0xc5, 0x0d, 0xf1, 0xf6, 0x56, 0x59, + 0x3a, 0xe6, 0xf6, 0x73, 0xf1, 0x1f, 0x9d, 0xe1, 0x81, 0x1d, + 0x36, 0x42, 0x85, 0x59, 0x2a, 0xee, 0x73, 0xc0, 0xa1, 0x8c, + 0xd7, 0x50, 0x0e, 0x1e, 0x5a, 0x57, 0x8c, 0x33, 0xbd, 0x0c, + 0x1d, 0x0b, 0x03, 0xc4, 0x37, 0xfd, 0xbb, 0x36, 0xea, 0x67, + 0xd9, 0x7c, 0x9f, 0x02, 0x11, 0x04, 0x37, 0x47, 0x89, 0x01, + 0x2a, 0xe6, 0x5b, 0x80, 0xa1, 0x84, 0xd6, 0xd0, 0x0e, 
0x1e, + 0xf8, 0x57, 0x8c, 0x37, 0xbd, 0x0c, 0x5d, 0x0f, 0x01, 0xc4, + 0x17, 0xf9, 0xbb, 0x36, 0xe2, 0x67, 0xd9, 0x74, 0x9d, 0x43, + 0x11, 0x04, 0x37, 0x4e, 0x99, 0x01, 0xd5, 0x19, 0xa4, 0x7f, + 0x5e, 0x7b, 0x29, 0x2f, 0xf1, 0xe1, 0x07, 0xa8, 0x73, 0xc8, + 0x42, 0xf3, 0xa2, 0xf0, 0xfe, 0x3b, 0xe8, 0x06, 0x44, 0xc9, + 0x1d, 0x98, 0x26, 0x8b, 0x62, 0xbc, 0xee, 0xfb, 0xc8, 0xb1, + 0x66, 0xfe, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 
0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0xad, 0x91, 0xce, 0xbd, 0x79, 0xce, 0x17, 0xea, 0x0b, + 0x09, 0xf1, 0xf1, 0xb6, 0xab, 0x71, 0x10, 0xb8, 0xb7, 0xae, + 0xf9, 0x51, 0x02, 0x8c, 0xc5, 0x3d, 0xd7, 0x67, 0x02, 0x4f, + 0xcd, 0x2d, 0x06, 0xdc, 0x9d, 0x63, 0x59, 0xbf, 0xe1, 0xfe, + 0xab, 0x2b, 0x85, 0xf5, 0x90, 0x24, 0x89, 0xe1, 0xcb, 0xc9, + 0x91, 0x08, 0x98, 0x58, 0x12, 0xb5, 0xd8, 0xb7, 0x89, 0x14, + 0x23, 0x99, 0x5f, 0x4f, 0x46, 0x14, 0x08, 0xc5, 0xa2, 0xea, + 0xf8, 0xf2, 0x7d, 0xd5, 0xc3, 0x73, 0x7a, 0x4e, 0xe0, 0x46, + 0xaf, 0x98, 0xdd, 0xe2, 0x4a, 0x2d, 0x4f, 0xf5, 0xe6, 0x9b, + 0xfe, 0x13, 0xae, 0xaa, 0x71, 0x9e, 0xdb, 0xe6, 0xcd, 0xe6, + 0x63, 0x45, 0xc5, 0x88, 0x2f, 0x02, 0x72, 0x95, 0xb3, 0x33, + 0xa2, 0xd2, 0x08, 0xa0, 0x92, 0x82, 0x4f, 0x0d, 0x97, 0x21, + 0xa2, 0x1f, 0x8d, 0x14, 0x1a, 0x77, 0x30, 0x76, 0x5e, 0x62, + 0x36, 0x39, 0x94, 0xd3, 0xa0, 0x8a, 0x4a, 0xe4, 0x31, 0x33, + 0xe9, 0xac, 0x5b, 0xb3, 0x2e, 0x9d, 0xce, 0x19, 0xfc, 0xcf, + 0x7f, 0xd6, 0xd6, 0x0b, 0x17, 0x3d, 0x90, 0xea, 0x2b, 0x0b, + 0x00, 0x4f, 0x1b, 0x2e, 0x8c, 0x2f, 0x20, 0xdd, 0x6f, 0xf1, + 0x08, 0x2c, 0xd1, 0x87, 0x8e, 0x5a, 0xab, 0x02, 0xa1, 0x25, + 0x31, 0x1e, 0x8d, 0x75, 0xbd, 0x6d, 0x72, 0xd6, 0x5a, 0x0f, + 0x51, 0x0d, 0xf0, 0x83, 0xef, 0xb1, 0x3a, 0x83, 0x1a, 0x2f, + 0x1e, 0x2a, 0x62, 0x4d, 0x5f, 0xdf, 0x48, 0xb2, 0x34, 0xdf, + 0x14, 0x14, 0x68, 0x6a, 0xe2, 0x87, 0x31, 0x9f, 0xcd, 0x71, + 0xbd, 0x6d, 0x72, 0xd6, 0x5e, 0x0d, 0x55, 0x05, 0x90, 0x83, + 0xaf, 0xb1, 0x2a, 0x0b, 0x1a, 0x2f, 0x1e, 0x2a, 0x22, 0x5d, + 0x7f, 0xff, 0x48, 0xb2, 0x74, 0xc7, 0x14, 0x10, 0x68, 0x0a, + 0xe2, 0x87, 0xce, 0x60, 0x32, 0x8e, 0x42, 0x92, 0x8d, 0x29, + 0xa1, 0xf2, 0xaa, 0xfa, 0x6f, 0x7c, 0x50, 0x4e, 0xd5, 0xf4, + 0xe5, 0xd0, 0xe1, 0xd5, 0xdd, 0xa2, 0x80, 0x00, 0xb7, 0x4d, + 0x8b, 0x38, 0xeb, 0xef, 0x97, 0xf5, 0x1d, 0x78, 0x6f, 0x53, + 0x3a, 0xfe, 0x44, 
0x3f, 0x84, 0x8a, 0x82, 0xf2, 0x95, 0x36, + 0x4d, 0xa7, 0x56, 0x12, 0x01, 0xa4, 0x57, 0x40, 0xf6, 0x1f, + 0x88, 0x33, 0xae, 0x10, 0xec, 0xe2, 0x06, 0x5d, 0x0f, 0xb9, + 0x5d, 0xfd, 0x04, 0x19, 0xd4, 0x9b, 0x6f, 0xf8, 0x33, 0x60, + 0xec, 0xdb, 0xd9, 0x57, 0xe0, 0x8f, 0xe5, 0xc4, 0x71, 0x50, + 0x9f, 0x62, 0x54, 0xd3, 0xda, 0x6c, 0xdd, 0x5f, 0x63, 0x39, + 0xff, 0x58, 0xde, 0xa9, 0xbf, 0xfe, 0x0b, 0x23, 0x0e, 0xb9, + 0xd3, 0x8c, 0x3d, 0xf9, 0x78, 0xb1, 0xef, 0xee, 0xb9, 0x8f, + 0xce, 0x78, 0x29, 0x03, 0xe4, 0x8b, 0x0c, 0x7c, 0x3e, 0x25, + 0x96, 0xe5, 0x4e, 0x31, 0xae, 0x18, 0x76, 0xc1, 0xfe, 0x84, + 0xe7, 0x70, 0xfb, 0x2b, 0x89, 0x15, 0x6d, 0xef, 0x28, 0xdd, + 0xd1, 0x58, 0x02, 0xb0, 0x80, 0x39, 0xb5, 0xe9, 0xa6, 0xd0, + 0x7e, 0x27, 0x54, 0xe9, 0xbd, 0xf6, 0xf8, 0x8f, 0x80, 0x29, + 0x88, 0x22, 0xda, 0xde, 0xa7, 0xa4, 0xd2, 0x63, 0x46, 0xca, + 0xd0, 0x95, 0x8e, 0xf3, 0x67, 0xfc, 0x7c, 0xd3, 0x97, 0x5a, + 0x7c, 0x3b, 0xf2, 0xa9, 0x8a, 0x34, 0x64, 0xf1, 0xa0, 0x3c, + 0x66, 0x43, 0x1b, 0xba, 0xb0, 0x8a, 0x1c, 0x68, 0x53, 0x43, + 0xd0, 0x24, 0x25, 0xcc, 0x42, 0x72, 0x49, 0x54, 0xd0, 0x77, + 0xef, 0xfb, 0xc6, 0xd1, 0x93, 0xf0, 0x72, 0x39, 0x1a, 0xab, + 0xb8, 0x74, 0x65, 0x4c, 0x8e, 0x6e, 0x62, 0xe7, 0xd2, 0xe6, + 0x78, 0x2a, 0x88, 0x31, 0x03, 0x62, 0x66, 0x3f, 0x65, 0x68, + 0x42, 0xf3, 0x1a, 0x50, 0xc0, 0x77, 0xef, 0xfb, 0xce, 0xd1, + 0x93, 0xf0, 0x70, 0x39, 0x12, 0xa1, 0xb8, 0x74, 0x65, 0x45, + 0x86, 0x6e, 0x66, 0xe7, 0xd2, 0xe6, 0x78, 0x2a, 0x88, 0x31, + 0x53, 0x62, 0x62, 0x34, 0x65, 0x68, 0x42, 0xf2, 0x1a, 0x54, + 0x3f, 0x88, 0x10, 0x04, 0x31, 0x2e, 0x6c, 0x0f, 0x8f, 0xc6, + 0xed, 0x56, 0x47, 0x8b, 0x9a, 0xba, 0x79, 0x91, 0x99, 0x18, + 0x2d, 0x19, 0x87, 0xd5, 0x77, 0xce, 0xac, 0x9d, 0x9d, 0xcb, + 0x9a, 0x97, 0xbd, 0x0d, 0xe5, 0xab, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 
0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xe5, 0x77, 0xae, 0xd6, 0x41, 0x1e, + 0x28, 0x48, 0x6f, 0x6d, 0x57, 0x6e, 0x48, 0x31, 0x03, 0x78, + 0xa6, 0x49, 0x91, 0x1a, 0xf2, 0x2b, 0x28, 0x7c, 0x2a, 0xa5, + 0x1c, 0xd1, 0x1e, 0x0f, 0xe7, 0x40, 0xcc, 0x52, 0xcc, 0xa3, + 0x90, 0x42, 0x99, 0xdc, 0xfe, 0xdd, 0xff, 0xea, 0x35, 0xf8, + 0x60, 0xe0, 0x78, 0xf4, 0x7d, 0x92, 0x2d, 0xbc, 0xc2, 0x1a, + 0x63, 0x59, 0x06, 0xf6, 0x41, 0xdd, 0x9d, 0xb8, 0xcf, 0xfa, + 0xf0, 0xe5, 0xd6, 0x9c, 0x68, 0x52, 0x5d, 
0x05, 0x43, 0x36, + 0xa1, 0x1c, 0x39, 0x73, 0xa9, 0xca, 0xb1, 0x3c, 0xbf, 0xca, + 0x0f, 0x18, 0xfd, 0xe4, 0x44, 0x5c, 0xf1, 0x8b, 0xd2, 0xef, + 0xe3, 0x83, 0xb1, 0xc4, 0xb8, 0xf4, 0x21, 0xe8, 0xab, 0x37, + 0x6b, 0x8f, 0xa7, 0x58, 0xe4, 0xca, 0x54, 0x8f, 0x8c, 0x4e, + 0x28, 0x23, 0x95, 0xf4, 0xa5, 0x5b, 0xb2, 0x29, 0x87, 0x06, + 0x30, 0xcd, 0x51, 0xad, 0x2f, 0xd2, 0xf6, 0xe7, 0x0b, 0xa9, + 0xc5, 0x57, 0xe8, 0x36, 0x5d, 0x2d, 0xe0, 0x18, 0x52, 0xfa, + 0x2e, 0x02, 0xf7, 0xf8, 0xd8, 0x76, 0x0e, 0x6e, 0x0d, 0x15, + 0xd4, 0x9c, 0x04, 0xc9, 0x50, 0xbd, 0xed, 0x69, 0x33, 0x41, + 0x67, 0xdd, 0xa2, 0xd9, 0x64, 0x20, 0xb1, 0x60, 0xf5, 0xb1, + 0xd1, 0xe3, 0x94, 0x86, 0x5b, 0xfb, 0x3f, 0x04, 0xfd, 0x7d, + 0x58, 0x72, 0x8f, 0x66, 0x87, 0x07, 0x08, 0x68, 0x60, 0x38, + 0x9f, 0xaf, 0xee, 0x6d, 0xf5, 0x11, 0xeb, 0xcb, 0xba, 0x51, + 0x31, 0x90, 0x8b, 0xb2, 0xd0, 0xbb, 0x57, 0x53, 0x34, 0xda, + 0x53, 0xfb, 0x3f, 0x06, 0xfd, 0x7d, 0x58, 0x72, 0x0f, 0x6e, + 0x85, 0x07, 0x08, 0x68, 0x20, 0x38, 0x97, 0xae, 0x6f, 0x6d, + 0xf1, 0x01, 0xe3, 0xca, 0xba, 0x51, 0x31, 0x90, 0x8d, 0xb2, + 0xd0, 0xbb, 0xd7, 0x53, 0xb4, 0x9a, 0xac, 0x04, 0xc0, 0xf9, + 0x02, 0x82, 0xa7, 0x8d, 0xf0, 0x91, 0x7a, 0xf8, 0xf7, 0x97, + 0xdf, 0xc7, 0x68, 0x51, 0x90, 0x92, 0x0e, 0xfe, 0x1c, 0x35, + 0x45, 0xae, 0xce, 0x6f, 0x72, 0x4d, 0x2f, 0x44, 0x28, 0xac, + 0x4b, 0x65, 0x01, 0x35, 0x1e, 0xf1, 0x97, 0xb5, 0x9d, 0xa3, + 0x4a, 0xe5, 0x7e, 0x81, 0x1a, 0x37, 0xca, 0x9d, 0xb3, 0x10, + 0x96, 0x34, 0x1f, 0xf7, 0x3d, 0x26, 0xb2, 0x7d, 0x02, 0x8c, + 0x75, 0xd7, 0xb6, 0x19, 0xaf, 0x81, 0xa9, 0x4d, 0xe9, 0xbc, + 0xe6, 0x04, 0x48, 0x74, 0x67, 0x72, 0xe0, 0x93, 0x99, 0x05, + 0x55, 0x3f, 0xdb, 0xeb, 0x87, 0x17, 0x04, 0xc1, 0x5b, 0xd6, + 0xb8, 0xd7, 0x7b, 0x7f, 0xf6, 0x26, 0xeb, 0xb4, 0x23, 0x40, + 0xe4, 0x25, 0x48, 0xfb, 0x44, 0xbd, 0xc0, 0x71, 0x9e, 0xe8, + 0x93, 0x50, 0x80, 0x98, 0xcd, 0x52, 0xf3, 0x62, 0x2a, 0x92, + 0x1e, 0x67, 0x79, 0x8e, 0x4b, 0xd4, 0x02, 0xab, 0xa0, 0x48, + 0xda, 0x69, 0xf2, 0x35, 0x1d, 0xda, 0x02, 0x39, 0xaf, 
0x76, + 0xe9, 0xda, 0xfa, 0x22, 0x0b, 0xfb, 0x7e, 0x5b, 0xce, 0x7d, + 0x46, 0xc0, 0x59, 0x18, 0x71, 0x36, 0x34, 0xea, 0xcb, 0x5f, + 0x86, 0x0c, 0xea, 0xff, 0x9b, 0x47, 0xd2, 0xf5, 0x05, 0x21, + 0x73, 0xe2, 0xdf, 0x92, 0x58, 0x51, 0x2a, 0x8e, 0x69, 0xd7, + 0xa6, 0xac, 0x81, 0xb0, 0xa5, 0xe8, 0x62, 0x32, 0xc0, 0xbe, + 0x84, 0x09, 0xac, 0x84, 0x75, 0xbe, 0xbc, 0x9c, 0x67, 0x98, + 0x6c, 0xc0, 0xd6, 0xa1, 0x38, 0x74, 0xab, 0xcd, 0x1e, 0xcf, + 0x5a, 0x02, 0x60, 0x45, 0x81, 0x40, 0x8e, 0x6c, 0xdb, 0x1c, + 0x10, 0xea, 0x27, 0x32, 0xc0, 0x9e, 0x45, 0x13, 0x74, 0x43, + 0x0f, 0xbe, 0xa8, 0x9c, 0x62, 0x98, 0x6d, 0x06, 0xda, 0xe6, + 0xf9, 0x75, 0x0a, 0x4c, 0x5f, 0x87, 0xce, 0x02, 0x60, 0x0f, + 0xc1, 0x42, 0x8e, 0xec, 0xdb, 0x38, 0x94, 0xe8, 0x26, 0x30, + 0xc0, 0x9e, 0x45, 0x13, 0x74, 0x43, 0x4f, 0xbe, 0xa8, 0x9c, + 0x62, 0x98, 0x6c, 0x06, 0xda, 0xa4, 0xf9, 0x75, 0x0a, 0x4c, + 0x5f, 0x87, 0x4e, 0x02, 0x9f, 0xf0, 0x3e, 0xbd, 0x71, 0x13, + 0x24, 0xc7, 0x6b, 0x17, 0xd9, 0xcf, 0x3f, 0x61, 0xba, 0xec, + 0x8b, 0xbc, 0xb0, 0x41, 0x57, 0x63, 0x9d, 0x67, 0x93, 0xf9, + 0x25, 0x5b, 0x06, 0x8a, 0xf5, 0xb3, 0xa0, 0x78, 0xb1, 0xfd, + 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x98, 0xf7, + 0x0d, 0xd2, 0x0d, 0x3f, 0x4e, 0x78, 0xf9, 0x4b, 0x87, 0xe3, + 0x44, 0xb1, 0x82, 0x58, 0x68, 0x8c, 0x3a, 0xc1, 0x10, 0x7a, + 0x7f, 0x83, 0xa0, 0x9e, 0x69, 0xf5, 0x5a, 0x88, 0x16, 0x72, + 0x68, 0x29, 0x6f, 0x44, 0x72, 0x7b, 0x1e, 0x93, 0x0b, 0x39, + 0x43, 0xfa, 0x25, 0xa6, 0x57, 0x55, 0x84, 0x08, 0xd5, 0x31, + 0x4a, 0xc5, 0x59, 0xdf, 0x99, 0x59, 0xc8, 0x61, 0x45, 0x5a, + 0xc7, 0x7d, 0xea, 0xdb, 0x36, 0x80, 0x54, 0xa7, 0x49, 0x5b, + 0xdc, 0x92, 0xca, 0x77, 0x48, 0x1d, 0x27, 0xd5, 0x99, 0xee, + 0x9a, 0x0c, 0x7e, 0xb4, 0x60, 0x0a, 0xaf, 0x17, 0x49, 0xa9, + 0x7e, 0xce, 0xab, 0x39, 0x39, 0x91, 0x87, 0x78, 0x8d, 0x8c, + 0x31, 0x02, 0x18, 0x8b, 0x54, 0xb9, 0x2f, 0x29, 0x3d, 0xcd, + 0x35, 0x13, 0xa8, 0x20, 0x88, 0xce, 0xff, 0x2e, 0x9c, 0xc1, + 0x34, 0xc0, 0x65, 0x1d, 0x61, 0xc4, 0x2c, 0xa9, 0x23, 0x93, + 0xa6, 0xf0, 0x6f, 0xad, 0x5d, 0xeb, 0x74, 0x27, 0xac, 0x64, + 0xfd, 0x1d, 0x88, 0x89, 0xf7, 0x86, 0x2a, 0x17, 0x5d, 0xf3, + 0x06, 0x1a, 0xa3, 0xa3, 0x28, 0x9b, 0xb9, 0xae, 0x36, 0x46, + 0xf1, 0x75, 0xa0, 0x4c, 0xa3, 0x1c, 0x7d, 0x33, 0x69, 0x23, + 0xc1, 0xe1, 0xb3, 
0x23, 0xdf, 0x0a, 0xad, 0x54, 0x34, 0xcb, + 0xb5, 0x91, 0x6b, 0x93, 0xbd, 0x57, 0x84, 0x6b, 0x26, 0xf7, + 0xfc, 0x99, 0x78, 0x87, 0xb7, 0x2e, 0x63, 0xe6, 0xf4, 0xcc, + 0xa3, 0x10, 0x75, 0x1b, 0x67, 0x43, 0xc6, 0xe1, 0xa7, 0x32, + 0xfc, 0x10, 0xab, 0x5c, 0x24, 0x8b, 0xbf, 0x95, 0x6b, 0x93, + 0xbd, 0x57, 0x84, 0x6b, 0xa7, 0xa7, 0xfc, 0x99, 0x58, 0x8e, + 0xb7, 0x0e, 0x63, 0xe4, 0xf0, 0xcc, 0xa3, 0x10, 0x75, 0x13, + 0x67, 0x63, 0xc6, 0xe9, 0xb7, 0x32, 0xfc, 0x02, 0xab, 0x5c, + 0xdb, 0x74, 0x40, 0x6a, 0x94, 0x6c, 0x42, 0xa8, 0x7b, 0x94, + 0x58, 0x58, 0x03, 0x66, 0xa7, 0x71, 0x48, 0xf1, 0x9c, 0x1b, + 0x0f, 0x33, 0x5c, 0xef, 0x8a, 0xec, 0x98, 0x9c, 0x39, 0x16, + 0x48, 0xcd, 0x03, 0xfd, 0x54, 0xa3, 0x60, 0x88, 0xe3, 0x0b, + 0x99, 0x43, 0xc0, 0x55, 0x98, 0x74, 0xd2, 0x8d, 0xbb, 0x82, + 0x99, 0x0b, 0xa9, 0x1e, 0xbf, 0x1c, 0xff, 0x1a, 0x09, 0xaf, + 0x5b, 0xbf, 0xc9, 0x4d, 0x10, 0xaf, 0x90, 0x1a, 0x18, 0x41, + 0xc9, 0x59, 0xce, 0xb2, 0x76, 0x43, 0x5d, 0x59, 0xfa, 0x1f, + 0x4c, 0x22, 0xec, 0x98, 0x1f, 0x68, 0x39, 0xea, 0x11, 0x6b, + 0xd1, 0xd0, 0x88, 0x59, 0xa7, 0x16, 0xb7, 0x4e, 0x13, 0x18, + 0x1b, 0xc9, 0xc2, 0xb5, 0xb0, 0x1d, 0xde, 0x51, 0xf3, 0xca, + 0x83, 0x51, 0x42, 0xc2, 0x90, 0x1d, 0xd3, 0x5b, 0x58, 0x41, + 0x92, 0xea, 0x8c, 0x84, 0xc9, 0x9d, 0x2c, 0x15, 0x44, 0x7e, + 0x67, 0xb9, 0xc2, 0x9f, 0xac, 0x35, 0xe5, 0x21, 0xbc, 0x7a, + 0x97, 0xe3, 0x79, 0x47, 0x54, 0x97, 0x65, 0x06, 0x96, 0xfa, + 0x73, 0xd3, 0xca, 0x49, 0x74, 0x28, 0x9e, 0xb0, 0xc3, 0x4f, + 0x5a, 0xc1, 0x22, 0x8f, 0x93, 0x1d, 0xd5, 0x9c, 0x52, 0x01, + 0xce, 0x8f, 0x8e, 0x40, 0xb4, 0x0b, 0xe2, 0x48, 0x6c, 0x91, + 0x3e, 0xd8, 0x76, 0xe3, 0x21, 0x12, 0xc0, 0xfe, 0xd0, 0xe5, + 0xc4, 0x16, 0x23, 0x90, 0x89, 0x93, 0xcd, 0xc5, 0xe0, 0xf3, + 0x39, 0x44, 0x00, 0xe8, 0xe1, 0x19, 0xc1, 0xc3, 0x4e, 0xfc, + 0x22, 0xfe, 0x6f, 0x7c, 0xf6, 0x91, 0x49, 0xd9, 0x37, 0xd3, + 0x21, 0x70, 0xfa, 0x0a, 0x50, 0xfc, 0xe4, 0x16, 0x82, 0x90, + 0xb5, 0x9f, 0xc8, 0x41, 0x21, 0xbf, 0xbf, 0x04, 0x04, 0x79, + 0x61, 0x1c, 0x0f, 0xc3, 0xcd, 
0xbd, 0xb2, 0xe5, 0xc2, 0x7a, + 0x31, 0x03, 0x49, 0xd1, 0x37, 0xd3, 0x21, 0x72, 0xf2, 0x0a, + 0x52, 0xfd, 0xe4, 0x16, 0x82, 0x90, 0x95, 0x9f, 0xc8, 0x41, + 0x21, 0xbf, 0xbf, 0x44, 0x04, 0xf9, 0x61, 0x1d, 0x0f, 0xc3, + 0xcd, 0xbd, 0xb2, 0xe5, 0xe2, 0x78, 0x35, 0x03, 0xb6, 0x2e, + 0xc8, 0x2c, 0xde, 0x8d, 0x0d, 0xf5, 0xad, 0x02, 0x1b, 0xe9, + 0x7d, 0x6f, 0x6a, 0x60, 0x37, 0xbe, 0xde, 0x40, 0x40, 0xbb, + 0xfb, 0x06, 0x9e, 0xe2, 0xf0, 0x3c, 0x32, 0x42, 0x4d, 0x1a, + 0x1d, 0x87, 0xca, 0xfc, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 
0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x93, 0x95, 0x7e, 0xa4, 0x96, 0xce, 0x6b, 0x84, + 0x5f, 0x8f, 0xd6, 0x62, 0x1c, 0x75, 0xd9, 0x0f, 0x4f, 0xe8, + 0x67, 0xbe, 0xf4, 0xa7, 0xd9, 0xf9, 0x69, 0x68, 0xff, 0x61, + 0x0f, 0x86, 0x93, 0x8c, 0xdf, 0x2e, 0xbd, 0xd4, 0x73, 0x04, + 0x68, 0x85, 0x2e, 0xe4, 0xff, 0x6c, 0x95, 0x04, 0xab, 0x8e, + 0x6d, 0xf6, 0x08, 0xdf, 0x90, 0xed, 0xfe, 0xf4, 0x44, 0x7d, + 0xed, 0x8c, 0xaf, 0x85, 0x02, 0x63, 0xb0, 0xc3, 0xff, 0x97, + 0xbe, 0x31, 0x20, 0x0c, 0x48, 0x08, 0x2e, 0x28, 0xa0, 0xd8, + 0x5d, 0xb8, 0xea, 0xaa, 0x00, 0x25, 0x7d, 0xf2, 0x06, 0xe1, + 0x0f, 0x5e, 0xfe, 0x68, 0x27, 0xcc, 0x9e, 0xb4, 0x65, 0x14, + 0x1e, 0xd1, 0x52, 0x11, 0xdd, 0x4e, 0x7b, 0x3f, 0xb1, 0xaf, + 0xc6, 0x5d, 0x01, 0x8a, 0x82, 0x5c, 0xd0, 0xb0, 0x55, 0x95, + 0xbc, 0x7b, 0x09, 0xa7, 0x63, 0x9d, 0xdd, 0x98, 0x0c, 0x0d, + 0xb3, 0x07, 0x63, 0x08, 0x8c, 0xd6, 0xc5, 0x36, 0xe4, 0x05, + 0x96, 0x90, 0xaf, 0x2e, 0xde, 0xe3, 0x8e, 0xec, 0xfd, 0x5f, + 0x46, 0xfa, 0x2a, 0x92, 0x4f, 0x41, 0x50, 0xcc, 0xcc, 0xa9, + 0x34, 0xb3, 0xa1, 0x7c, 0xd4, 0x7f, 0x51, 0xc4, 0x71, 0x63, + 0xf1, 0x5e, 0x4c, 0xf7, 0xe1, 0x0d, 0xe6, 0x25, 0x9d, 0x4b, + 0x6b, 0xc6, 0x4c, 0x23, 0x6f, 0x09, 0x04, 0x1a, 0x3e, 0x90, + 0x47, 0xe7, 0xdd, 0xcc, 0x8c, 0xc7, 0x16, 0xa7, 0x27, 0x18, + 0x9c, 0x6f, 0x71, 0xfc, 0x79, 0x69, 0x81, 0xc7, 0xde, 0x2f, + 0xaa, 0x44, 0x9d, 0xd8, 0x9c, 0xae, 0x1a, 0x46, 0x4e, 0x21, + 0x6d, 0x09, 0x06, 0x5a, 0x7e, 0x90, 0x47, 0xe7, 0xdc, 0xcc, + 0x8c, 0xc7, 0x16, 0xa7, 0x27, 0x38, 0x9c, 0x6f, 0x71, 0xfc, + 0x59, 0x69, 0x81, 0xc7, 0xce, 0x27, 0xe9, 0x45, 0x95, 0xd8, + 0x9c, 0x2e, 0x1b, 0xc6, 0xb1, 0xde, 0x92, 0xf6, 0xf9, 0xa5, + 0x81, 0x6f, 0xb8, 0x18, 0x23, 0x33, 0x73, 0x38, 0xe9, 0x58, + 0xd8, 0xc7, 0x63, 0x90, 0x8e, 0x03, 0xa6, 0x96, 0x7e, 
0x38, + 0x31, 0xd8, 0x16, 0xba, 0x6a, 0x27, 0x63, 0xd1, 0xe4, 0x39, + 0xa5, 0xd1, 0x26, 0xe4, 0xca, 0x7a, 0x4f, 0x67, 0x90, 0xd2, + 0xb7, 0xfb, 0x20, 0xb7, 0x9d, 0xeb, 0xd4, 0xe8, 0x94, 0x13, + 0x7a, 0x9a, 0xb6, 0x93, 0x8c, 0x02, 0x39, 0xa1, 0x0b, 0x6b, + 0x1f, 0x1d, 0x33, 0xeb, 0x64, 0x00, 0xa6, 0x62, 0x12, 0xa2, + 0xd0, 0xd8, 0xeb, 0x9e, 0x7b, 0x50, 0xf8, 0x6f, 0xbc, 0x0e, + 0x50, 0xa4, 0xf9, 0x7d, 0x41, 0xf3, 0xbb, 0x2c, 0x5d, 0xd7, + 0x72, 0x39, 0x25, 0x12, 0xff, 0x33, 0x0c, 0x9b, 0x9c, 0x8a, + 0x68, 0xd5, 0xfe, 0x46, 0x62, 0xf1, 0x32, 0x94, 0x3d, 0xce, + 0xc0, 0x33, 0xfb, 0x3c, 0x61, 0xe0, 0xaa, 0x3c, 0xb5, 0x77, + 0x3c, 0x96, 0x47, 0x46, 0xa0, 0x6f, 0x9e, 0x68, 0xf2, 0x4a, + 0x53, 0xd5, 0x82, 0xf0, 0x4e, 0x2e, 0x4e, 0x55, 0x1b, 0x89, + 0xaf, 0x00, 0x06, 0xdd, 0x91, 0x56, 0xf8, 0x3a, 0x4e, 0xbf, + 0xa8, 0x08, 0x20, 0x0a, 0xe5, 0x33, 0x9f, 0xe2, 0x13, 0xc3, + 0x9e, 0xa4, 0x1d, 0x19, 0x8c, 0x97, 0x6b, 0xae, 0xba, 0x14, + 0x9f, 0x0c, 0xf4, 0x41, 0xe7, 0x99, 0x3d, 0xde, 0xdb, 0x56, + 0x92, 0x68, 0x5e, 0x8c, 0xde, 0x97, 0xe7, 0x4c, 0x13, 0x25, + 0xae, 0xc6, 0x38, 0x9e, 0xb6, 0xa3, 0xe4, 0x8d, 0xa2, 0x77, + 0xee, 0xd8, 0xd0, 0x54, 0x20, 0xd4, 0x6a, 0x3d, 0xbd, 0xb9, + 0xae, 0x13, 0xb5, 0x4e, 0x93, 0x66, 0xd0, 0xed, 0x7d, 0x1c, + 0x5c, 0xb5, 0xe3, 0x8c, 0x26, 0x77, 0x98, 0x97, 0x1c, 0x83, + 0x8d, 0x43, 0xbe, 0x35, 0x96, 0x2a, 0xbf, 0xfc, 0x10, 0x94, + 0x02, 0xa0, 0x13, 0x10, 0xb5, 0xf5, 0xa7, 0x13, 0xb5, 0x4e, + 0x93, 0x72, 0x90, 0xed, 0x7c, 0x1c, 0x5e, 0xb7, 0xa3, 0x4c, + 0x26, 0x77, 0x88, 0x87, 0x1c, 0x83, 0x8d, 0x43, 0xae, 0xa5, + 0x9e, 0x2a, 0xbf, 0xdc, 0x10, 0x94, 0x32, 0xb0, 0x13, 0x10, + 0xb5, 0xf1, 0x58, 0xec, 0x4a, 0xb1, 0x6c, 0x8d, 0x6f, 0x12, + 0x83, 0xe3, 0xa1, 0x48, 0x5c, 0xb3, 0xd9, 0x88, 0x77, 0x78, + 0xe3, 0x7c, 0x72, 0xbc, 0x51, 0x5a, 0x61, 0xd5, 0x40, 0x23, + 0xef, 0x6b, 0xcd, 0x4f, 0xec, 0xef, 0x4a, 0x0e, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 
0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2a, 0xbf, 0x90, 0xb2, + 0xd0, 0x0d, 0x6d, 0x55, 0x3d, 0x93, 0x08, 0x8e, 0xde, 0xf2, + 0x8a, 0x38, 0xda, 0x6c, 0x3f, 0x12, 0xfe, 0x62, 0xb9, 0xa9, + 0xa4, 0x1e, 0x6a, 0x23, 0xfc, 0x16, 0xf0, 0x37, 0xb0, 0xe4, + 0xeb, 0x72, 0xa8, 0xa0, 0xc7, 0x7e, 0x0e, 0xa3, 0xae, 0x25, + 0xff, 0xee, 0xdb, 0x54, 0x83, 0x82, 0xd2, 0x9c, 0xbf, 0x2e, + 0x78, 0xc4, 0x32, 
0x08, 0xf4, 0x60, 0x88, 0xa6, 0xcd, 0x1f, + 0x91, 0x50, 0x71, 0x26, 0x9a, 0x9a, 0x98, 0x2a, 0xc6, 0x53, + 0x15, 0xc9, 0xcc, 0x47, 0x3b, 0x64, 0xd6, 0x21, 0xe3, 0x9b, + 0x13, 0x27, 0x81, 0x43, 0x86, 0xc3, 0xe8, 0xd2, 0x62, 0xfb, + 0xae, 0x5b, 0xc5, 0x5b, 0x55, 0x45, 0xa5, 0x64, 0xb2, 0x8c, + 0xf8, 0x9c, 0x78, 0x51, 0x7e, 0x95, 0x39, 0x7d, 0x8f, 0x95, + 0x10, 0x4b, 0xa3, 0xba, 0x45, 0x0b, 0xc4, 0x1f, 0x4f, 0xf0, + 0x89, 0x75, 0x97, 0x1a, 0x16, 0x5a, 0x1d, 0x21, 0xc8, 0xb1, + 0x24, 0xd7, 0x9d, 0xe8, 0x1e, 0x99, 0x32, 0xfa, 0xed, 0x16, + 0xc3, 0x97, 0x5d, 0x64, 0xbf, 0x16, 0xdb, 0x60, 0x1e, 0x58, + 0x62, 0xc3, 0xfb, 0x17, 0x09, 0xb9, 0x68, 0x58, 0xee, 0x75, + 0xfd, 0x98, 0x74, 0xea, 0x1e, 0x38, 0x66, 0x5a, 0x1f, 0xd9, + 0x19, 0x43, 0x48, 0xdc, 0x98, 0x66, 0x02, 0xd7, 0xfc, 0x47, + 0x0f, 0x06, 0xd6, 0xee, 0x26, 0x9a, 0xe1, 0x01, 0x1b, 0x13, + 0x47, 0xc3, 0xf0, 0x42, 0xa4, 0x10, 0xbd, 0x7c, 0x74, 0x4e, + 0x15, 0xf1, 0x45, 0x5f, 0x8f, 0x89, 0xc5, 0x13, 0x48, 0xd9, + 0x53, 0x26, 0x82, 0xd7, 0xfc, 0x65, 0x0f, 0x06, 0xd6, 0x6a, + 0x26, 0x98, 0xe1, 0x01, 0x1b, 0x13, 0x4f, 0xd3, 0xf0, 0x42, + 0xa4, 0x10, 0xbd, 0x7c, 0x74, 0x4e, 0x14, 0xf1, 0x45, 0x5f, + 0x8f, 0x89, 0xc5, 0x13, 0x48, 0xd8, 0x5a, 0x26, 0x7d, 0x28, + 0x03, 0x9a, 0xf0, 0xf9, 0x29, 0x95, 0xd9, 0x67, 0x1e, 0xfe, + 0xe4, 0xec, 0xb0, 0x2c, 0x0f, 0xbd, 0x5b, 0xef, 0x42, 0x83, + 0x8b, 0xb1, 0xeb, 0x0e, 0xba, 0xa0, 0x70, 0x76, 0x3a, 0xec, + 0xb7, 0x27, 0xa5, 0xd9, 0x8b, 0x03, 0xb2, 0xa5, 0x06, 0x4e, + 0x04, 0x58, 0x73, 0x6b, 0x76, 0x95, 0xe1, 0x26, 0x04, 0xd1, + 0x68, 0x1d, 0x52, 0x58, 0x2b, 0x67, 0x02, 0xcb, 0x77, 0xc3, + 0x62, 0x17, 0xc7, 0x85, 0x2e, 0xc3, 0x09, 0x60, 0x78, 0x56, + 0x3d, 0x31, 0xfd, 0x95, 0xd1, 0x36, 0x78, 0x01, 0x1d, 0x1c, + 0xda, 0x0b, 0xa3, 0x4f, 0x7f, 0xe2, 0x0b, 0xaa, 0x96, 0x81, + 0xa0, 0x62, 0x0d, 0x7a, 0x09, 0xd2, 0x9e, 0x41, 0xb1, 0x01, + 0xa8, 0x71, 0x50, 0x85, 0x4c, 0xf5, 0x75, 0xdf, 0xd0, 0x16, + 0x1d, 0x14, 0x3f, 0x0f, 0x76, 0x9c, 0xd3, 0x76, 0xdc, 0x31, + 0xb5, 0x32, 0x22, 0xf3, 0x12, 
0x7c, 0x2e, 0xbc, 0xaa, 0xdb, + 0xbe, 0x45, 0xd6, 0x95, 0x83, 0x7d, 0x54, 0x1f, 0x8c, 0x94, + 0x4e, 0xa1, 0x6d, 0x93, 0x58, 0x3e, 0x08, 0x6a, 0xee, 0xfc, + 0x58, 0x31, 0x0d, 0x07, 0x13, 0xd5, 0x40, 0xb8, 0x9d, 0x9a, + 0x08, 0x92, 0x3c, 0x17, 0x6e, 0x0c, 0x39, 0x94, 0x9d, 0xb3, + 0xb7, 0x88, 0x6a, 0x87, 0x3b, 0x90, 0x91, 0xf5, 0x88, 0xd6, + 0x19, 0x48, 0xf3, 0xcf, 0x80, 0x05, 0x76, 0x6e, 0x5b, 0x85, + 0xc9, 0xeb, 0x4a, 0x97, 0xfe, 0xc3, 0xe8, 0x79, 0xd4, 0x63, + 0x0c, 0x55, 0x6c, 0xc3, 0x67, 0x55, 0x3c, 0xbe, 0x7e, 0x83, + 0x1b, 0xbb, 0x35, 0x22, 0xe7, 0x12, 0xb8, 0x79, 0xf4, 0xec, + 0xa0, 0x45, 0x74, 0x62, 0x73, 0x4e, 0x49, 0x87, 0xc0, 0x32, + 0xdd, 0x00, 0xca, 0xff, 0xb4, 0x27, 0x06, 0x4f, 0x7c, 0xd6, + 0xe7, 0x15, 0x3d, 0x44, 0x7a, 0x49, 0x3b, 0xe1, 0x86, 0xb3, + 0xed, 0x92, 0x19, 0x69, 0xf4, 0xec, 0xa0, 0x45, 0x74, 0x60, + 0x73, 0x4e, 0x09, 0xc7, 0xc0, 0x32, 0xdd, 0x02, 0xc8, 0xfd, + 0xb4, 0x27, 0x0e, 0x4e, 0x7c, 0xd2, 0xe5, 0x15, 0x3d, 0x24, + 0x7a, 0x43, 0x3b, 0xa1, 0x86, 0xb3, 0x12, 0x6d, 0xe6, 0x96, + 0x0b, 0x13, 0x5f, 0xba, 0x8b, 0x9f, 0x8c, 0xb1, 0xf6, 0x38, + 0x3f, 0xcd, 0x22, 0xfd, 0x37, 0x02, 0x4b, 0xd8, 0xf1, 0xb1, + 0x83, 0x2d, 0x1a, 0xea, 0xc2, 0xdb, 0x85, 0xbc, 0xc4, 0x5e, + 0x79, 0x4c, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 
0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x08, 0xb0, 0x69, 0x3d, 0xf0, 0x30, 0x89, 0x7c, 0x2a, 0x99, + 0x41, 0xf5, 0x62, 0xf2, 0x31, 0xc9, 0x5e, 0xe5, 0x5a, 0x83, + 0x52, 0x81, 0x5a, 0x27, 0xfc, 0xd7, 0x96, 0x34, 0xa1, 0xe5, + 0x6f, 0xa1, 0xec, 0xf3, 0x8d, 0x5d, 0x2d, 0xf9, 0x52, 0x0c, + 0x06, 0x21, 0x95, 0xab, 0x7f, 0xc9, 0xd2, 0x18, 0x1a, 0x70, + 0x9b, 0x03, 0x52, 0xf4, 0x78, 0x1a, 0x7c, 0x38, 0xaf, 0xe5, + 0x95, 0x74, 0xf2, 0x12, 0x9c, 0x75, 0x92, 0x8a, 0x48, 0xec, + 0x73, 0x21, 0x61, 0xbc, 0x75, 0x18, 0xc6, 0x46, 0xee, 0xf5, + 0xbb, 0x5b, 0xb2, 0x03, 0x39, 0x1f, 0x3c, 0x31, 0xba, 0x49, + 0xf0, 0xfb, 0xe4, 0x19, 0x99, 0x17, 0x46, 0x47, 0x4a, 0x46, + 0x98, 0x10, 0x46, 0x8a, 0x7a, 0x76, 0xdf, 0x94, 0x83, 0x20, + 0x27, 0x27, 0x68, 0xc4, 0x9b, 0x4a, 0x56, 0x2c, 0x2d, 0xb5, + 0x14, 0xe0, 0xc6, 0x60, 0x08, 0x08, 0xd2, 0xa8, 0xca, 0xad, + 0x65, 0x6f, 0xa4, 0xf5, 0x63, 0xd1, 0x6a, 0x85, 0xa9, 0xb2, + 0x87, 0x0c, 0xad, 0xe9, 0x8a, 0xc8, 0x2e, 0x39, 0xaa, 0x20, + 0xfa, 0xf7, 0xd7, 0x6d, 0x70, 0x5f, 0xbc, 0x8f, 0x91, 
0xaf, + 0xff, 0x60, 0xdc, 0x75, 0x0d, 0x72, 0xe7, 0x79, 0xba, 0x59, + 0x6d, 0x38, 0x8e, 0xce, 0x9e, 0xa4, 0x64, 0x46, 0xf4, 0x22, + 0xfc, 0x28, 0x39, 0x3a, 0xae, 0x3a, 0xba, 0xff, 0x57, 0xe8, + 0x99, 0xbf, 0xfc, 0xe4, 0xc9, 0x8e, 0x2f, 0x60, 0xde, 0x70, + 0xd7, 0xe5, 0xc3, 0x7d, 0x99, 0x1b, 0x00, 0x05, 0xbc, 0x64, + 0xd3, 0xac, 0x04, 0xef, 0xfa, 0x38, 0xfc, 0xa8, 0x3b, 0x3b, + 0xae, 0x2a, 0xba, 0xff, 0x57, 0xa8, 0xb1, 0xbf, 0xfc, 0xe5, + 0xc9, 0xae, 0x2f, 0x60, 0xd8, 0x71, 0xc7, 0xe5, 0xc3, 0x7d, + 0x99, 0x1b, 0x00, 0x05, 0xbc, 0xc4, 0xdb, 0xac, 0x04, 0xee, + 0xfa, 0x38, 0x03, 0x57, 0xc4, 0xc4, 0x51, 0xd5, 0x45, 0x00, + 0xa8, 0x57, 0x4e, 0x40, 0x03, 0x1a, 0x36, 0x51, 0xd0, 0x9f, + 0x27, 0x8e, 0x38, 0x1a, 0x3c, 0x82, 0x66, 0xe4, 0xff, 0xfa, + 0x43, 0x3b, 0x24, 0x53, 0xfb, 0x11, 0x05, 0xc7, 0x02, 0xb3, + 0x45, 0x5e, 0x4a, 0xbb, 0x70, 0x66, 0xcc, 0x3c, 0xf1, 0xbc, + 0xd4, 0x95, 0xc9, 0x36, 0x85, 0x7b, 0xc5, 0xb3, 0xf7, 0xdb, + 0x65, 0xbb, 0xd2, 0xed, 0xb2, 0xae, 0x40, 0x6b, 0x45, 0x46, + 0xd5, 0x07, 0xf4, 0xa7, 0x20, 0x11, 0xe8, 0xaf, 0x4c, 0x8c, + 0x7a, 0xa8, 0xc7, 0x26, 0x74, 0x84, 0x29, 0x59, 0x08, 0x70, + 0xcf, 0x46, 0xf5, 0xd2, 0x99, 0x60, 0x0f, 0x19, 0x5b, 0xf5, + 0x32, 0xc5, 0x4e, 0xf8, 0x76, 0x13, 0x18, 0x00, 0x56, 0x02, + 0x32, 0x2a, 0x78, 0xb5, 0x85, 0xcb, 0x29, 0x91, 0x42, 0xa8, + 0xf1, 0xd7, 0xd5, 0x41, 0x43, 0x87, 0xd2, 0x4a, 0xa3, 0x25, + 0x40, 0x5d, 0x95, 0xf9, 0xe5, 0x01, 0x9e, 0xe0, 0x49, 0xac, + 0x9e, 0x99, 0x85, 0xdf, 0x4b, 0xb3, 0x4f, 0xce, 0x86, 0x49, + 0xb3, 0x3c, 0x59, 0x44, 0x7b, 0x87, 0xb3, 0x30, 0x31, 0xeb, + 0xe5, 0x1c, 0x72, 0x29, 0x8e, 0x80, 0xe1, 0x16, 0xf9, 0x25, + 0xa3, 0xe6, 0xa8, 0x03, 0xfe, 0x25, 0x0f, 0xbf, 0x13, 0x5c, + 0x03, 0x1d, 0xc5, 0xf7, 0x17, 0x9e, 0xa9, 0x18, 0x03, 0x35, + 0x38, 0x17, 0xa7, 0xe8, 0x54, 0x6b, 0x0b, 0xbd, 0xc9, 0x15, + 0xb6, 0x64, 0x2a, 0xe3, 0x08, 0x0a, 0x6a, 0x96, 0x00, 0x5a, + 0xdc, 0x8c, 0x08, 0x17, 0xb6, 0x3d, 0x83, 0x44, 0xdd, 0xca, + 0x60, 0xba, 0x45, 0x10, 0x71, 0x20, 0xf1, 0xaf, 0xd7, 0xc5, + 0x56, 
0x6f, 0x8a, 0xa5, 0x80, 0x32, 0xb4, 0x40, 0x2a, 0x67, + 0xe6, 0x25, 0x6b, 0xc7, 0x00, 0x40, 0xd2, 0x85, 0x03, 0xb5, + 0xd2, 0x1f, 0x81, 0x5f, 0xdd, 0xca, 0x62, 0xba, 0x25, 0x10, + 0x79, 0x20, 0xf0, 0xaf, 0xd7, 0xe5, 0x54, 0x6f, 0x8b, 0xa5, + 0x80, 0x20, 0xb6, 0x40, 0x2a, 0xe7, 0xe4, 0x25, 0x6b, 0xc6, + 0x00, 0x40, 0xda, 0x85, 0x03, 0xb7, 0xd2, 0x1f, 0xc1, 0x5f, + 0x22, 0x35, 0x9d, 0x45, 0xda, 0xef, 0x86, 0xdf, 0x0f, 0x50, + 0x28, 0x1a, 0xab, 0x90, 0x74, 0x5a, 0x7f, 0xdf, 0x49, 0xbf, + 0xd5, 0x18, 0x1b, 0xda, 0x94, 0x39, 0xff, 0xbf, 0x25, 0x7a, + 0xfc, 0x48, 0x2d, 0xe0, 0x3e, 0xa0, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x5e, 0x8c, 0xe5, 0x94, 0x09, 0xbf, + 0x89, 0x20, 0xb5, 0xdd, 0x62, 0x7d, 0xab, 0x5e, 0xe2, 0xe9, + 0x9e, 0x0e, 0x41, 0x1b, 0x0d, 0xd5, 0xd6, 0x27, 0xe3, 0x4b, + 0x6f, 0xd3, 0x1b, 0xa4, 0x66, 0x57, 0xfd, 0x22, 0xad, 0x00, + 0x00, 0xa6, 0xe4, 0x7b, 0xe0, 0x76, 0xc0, 0xaa, 0x0c, 0x8e, + 0x84, 0xbf, 0x9c, 0xb9, 0x19, 0x24, 0x53, 0x97, 0x41, 0x8f, + 0x38, 0x30, 0xcd, 0x4c, 0x35, 0x1d, 0xb0, 0x02, 0x8b, 0x64, + 0x0d, 0x6c, 0x1c, 0x60, 0x79, 0x8a, 0x7f, 0x02, 0x13, 0x63, + 0xd8, 0xe3, 0x2a, 0x48, 0x5c, 0x0f, 0x11, 0xaf, 0xc9, 0xc8, + 0x60, 0x87, 0xef, 0x22, 0xc9, 0x53, 0x5f, 0x3f, 0x5c, 0x21, + 0x07, 0x73, 0x3d, 0xd2, 0xc1, 0x55, 0x21, 0x33, 0xd9, 0xa4, + 0xda, 0xa6, 0x43, 0xd6, 0xa7, 0x23, 0xb2, 0x84, 0x33, 0xa4, + 0x26, 0xf5, 0x10, 0x51, 0x26, 0x1b, 0x75, 0x2f, 0x5d, 0xde, + 0x61, 0x26, 0xf1, 0x1d, 0x48, 0x5c, 0x70, 0x37, 0x35, 0x49, + 0x8f, 0x9d, 0xe1, 0x11, 0xf2, 0x07, 0xde, 0x6b, 0xcd, 0xf2, + 0xfe, 0xf9, 0xc4, 0xae, 0x35, 0x67, 0x57, 0xc3, 0xbf, 0x5d, + 0xd1, 0x56, 0x96, 0x1c, 0x44, 0x99, 0x9d, 0x32, 0x76, 0xf4, + 0x03, 0x36, 0x46, 0x97, 0xfc, 0x92, 0x93, 0xce, 0xf4, 0xb6, + 0xb0, 0x15, 0xd1, 0x1e, 0xce, 0xf2, 0xfd, 0xbb, 0xec, 0xb2, + 0x37, 0x27, 0xc3, 0xf3, 0x99, 0xfd, 0xe3, 0x77, 0x18, 0x53, + 0xc4, 0x83, 0x01, 0x68, 0x7e, 0x5f, 0x77, 0x70, 0x66, 0x93, + 0xae, 0x50, 0x09, 0x17, 0xe0, 0x5a, 0xc0, 0x03, 0xf9, 0xdc, + 0xcf, 0xf2, 0xfd, 0xbb, 0xe4, 0xb2, 0x37, 0x27, 0x47, 0xf3, + 0x99, 0xdd, 0xe3, 0x77, 0x18, 0x17, 0xc4, 0x83, 0x41, 0x22, + 0x7e, 0x5f, 0x65, 0x70, 0x66, 0x93, 0xac, 0x52, 0x0b, 0x17, + 0xe0, 0x3a, 0x90, 0x03, 0xf1, 0xde, 0x30, 0x0d, 0x02, 0x44, + 0x1b, 0x4d, 0xc8, 0xd8, 0xb8, 
0x0c, 0x66, 0x22, 0x1c, 0x88, + 0xe7, 0xe8, 0x3b, 0x7c, 0xbe, 0xdd, 0x81, 0xa0, 0x9a, 0x8f, + 0x99, 0x6c, 0x53, 0xad, 0xf4, 0xe8, 0x1f, 0xc5, 0x6f, 0xfc, + 0x0e, 0x21, 0xc4, 0xe9, 0x80, 0x29, 0x1b, 0x9a, 0x33, 0xcd, + 0x51, 0x7f, 0xdf, 0x12, 0x1f, 0x1d, 0xcb, 0xcc, 0x75, 0xe6, + 0x78, 0x04, 0x30, 0x24, 0x24, 0x52, 0xbd, 0x34, 0x79, 0x7f, + 0x4a, 0x8a, 0xd0, 0x42, 0xa3, 0x91, 0xc6, 0x25, 0xec, 0xce, + 0x34, 0xc9, 0x0c, 0x1e, 0x70, 0xa9, 0xd2, 0xcb, 0xcc, 0xe7, + 0xd4, 0xfd, 0x83, 0x48, 0xc6, 0x7b, 0x7e, 0x42, 0x19, 0x3b, + 0x3c, 0x05, 0xa9, 0x40, 0xa6, 0xaf, 0x74, 0xfa, 0x59, 0x73, + 0x5f, 0x09, 0x6c, 0x6f, 0x8c, 0x3e, 0x04, 0xb7, 0xf8, 0xd4, + 0xef, 0xda, 0x6a, 0xe2, 0x17, 0x4a, 0x18, 0x38, 0xae, 0x29, + 0xf9, 0x3c, 0x57, 0x0b, 0xcd, 0x64, 0x32, 0x53, 0x32, 0xd8, + 0xd1, 0x68, 0x32, 0xc6, 0x9c, 0x9d, 0x2d, 0x37, 0x81, 0x42, + 0xf8, 0x11, 0xf9, 0x95, 0xe5, 0x21, 0xe1, 0x26, 0x38, 0x18, + 0x4e, 0xcc, 0x17, 0xb0, 0x2a, 0x1e, 0x80, 0x10, 0x36, 0xfd, + 0x42, 0x1b, 0xf6, 0x4c, 0x8d, 0x9a, 0x55, 0x0a, 0x0f, 0x7e, + 0x76, 0xe8, 0xc6, 0x6f, 0x7a, 0x63, 0xd1, 0xe4, 0x58, 0x95, + 0x85, 0x12, 0x02, 0x02, 0xc3, 0xb7, 0x4e, 0x96, 0x4c, 0x8b, + 0xd8, 0xf6, 0xce, 0xd4, 0x89, 0xee, 0xa4, 0x4b, 0xd7, 0x00, + 0x50, 0x98, 0x94, 0x99, 0x26, 0x01, 0xa2, 0x38, 0x0d, 0x80, + 0xb7, 0x13, 0xfd, 0xe0, 0x78, 0x37, 0x29, 0x92, 0x39, 0x0a, + 0xc3, 0xa7, 0x96, 0x8a, 0x44, 0x0d, 0x5b, 0xde, 0xde, 0x52, + 0x8a, 0xae, 0xb0, 0x48, 0x97, 0x10, 0x7c, 0x81, 0x50, 0xbc, + 0x06, 0x1a, 0xb9, 0xaa, 0x44, 0x63, 0x2a, 0x61, 0xf9, 0xe0, + 0x78, 0xb5, 0x2d, 0x92, 0x38, 0x0a, 0xc3, 0xa7, 0xd6, 0xca, + 0x44, 0x19, 0xda, 0xde, 0xde, 0x50, 0x8a, 0xae, 0xb0, 0x48, + 0x97, 0x10, 0x7c, 0x81, 0x50, 0xbc, 0x06, 0x1a, 0xbb, 0xa8, + 0x44, 0x63, 0x3f, 0x61, 0x06, 0x1f, 0x87, 0x4a, 0xd2, 0x6d, + 0xc7, 0xf5, 0x3c, 0x58, 0x29, 0x35, 0xbb, 0xe6, 0x25, 0x21, + 0x21, 0xaf, 0x75, 0x51, 0x4f, 0xb7, 0x68, 0xef, 0x83, 0x7e, + 0xaf, 0x43, 0xf9, 0xe5, 0x44, 0x57, 0xbb, 0x9c, 0xc0, 0x9e, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x06, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x4c, + 0xe2, 0xf8, 0xca, 0x04, 0xba, 0x55, 0x88, 0x7c, 0xb0, 0x5a, + 0x80, 0x66, 0x17, 0x3f, 0x3e, 0x3a, 0xdd, 0x3e, 0xbd, 0x7d, + 0x53, 0x7e, 0xf5, 0x2d, 0xfd, 0xd3, 0xc0, 0x78, 0xba, 0xf6, + 0x0b, 0xe0, 0xca, 0x1d, 0xcd, 0x1a, 0xe1, 0x6c, 0xc2, 
0x50, + 0x84, 0xec, 0x3f, 0xfb, 0xe1, 0x09, 0x88, 0x60, 0x0c, 0x6b, + 0xa8, 0x58, 0x3b, 0x3b, 0x06, 0x5a, 0x5a, 0xa4, 0x5d, 0x0c, + 0xd0, 0xe4, 0x7c, 0x57, 0x06, 0xdc, 0xc4, 0x4c, 0xe6, 0x0d, + 0xc4, 0x5e, 0xec, 0x2a, 0x7b, 0xf9, 0x03, 0x5e, 0x21, 0xe2, + 0xd9, 0x52, 0x05, 0x7c, 0xb6, 0xa9, 0x8c, 0x47, 0x18, 0x2f, + 0x2a, 0x56, 0xb1, 0xcc, 0x6c, 0xf4, 0xfa, 0x3c, 0xd9, 0xe4, + 0x40, 0x55, 0x9d, 0x81, 0x45, 0x65, 0xe8, 0x0d, 0x52, 0xca, + 0x75, 0x4e, 0x49, 0xb4, 0xf6, 0xe0, 0x24, 0x97, 0x9d, 0xff, + 0x10, 0x01, 0x4a, 0xce, 0x58, 0x55, 0x91, 0x2c, 0x19, 0x78, + 0xfe, 0x75, 0x21, 0x56, 0x32, 0xe2, 0x4f, 0x00, 0x85, 0x23, + 0x7d, 0x12, 0x9e, 0xfa, 0xcc, 0xcc, 0xab, 0x02, 0x71, 0xe0, + 0x7f, 0x82, 0x78, 0x0b, 0x47, 0x22, 0x70, 0xa2, 0xd8, 0xa7, + 0x54, 0xec, 0x0b, 0x8f, 0x8a, 0x1b, 0xd8, 0x74, 0xdb, 0xc4, + 0x68, 0xdb, 0xa4, 0xb1, 0xf6, 0x07, 0x48, 0x6d, 0xd8, 0xa2, + 0xec, 0xcc, 0x16, 0xda, 0xaf, 0xc4, 0x35, 0xf2, 0x58, 0x83, + 0x43, 0xf6, 0x30, 0xa8, 0xdb, 0x25, 0x76, 0x3d, 0xa3, 0xba, + 0x2d, 0xd9, 0xf8, 0x58, 0x5d, 0x94, 0xaa, 0x53, 0xe4, 0xb0, + 0x36, 0x07, 0x7d, 0xa4, 0xd8, 0xe2, 0xec, 0xcc, 0x16, 0xda, + 0xa7, 0xe4, 0x37, 0xe2, 0x58, 0x83, 0x47, 0xf6, 0x10, 0xa0, + 0xdb, 0xa5, 0x56, 0x3d, 0x83, 0xbe, 0xad, 0xd9, 0xf8, 0x5c, + 0xdd, 0xc4, 0xaa, 0x53, 0xe4, 0xb0, 0x36, 0x07, 0x6d, 0x24, + 0x27, 0x1d, 0x13, 0x33, 0xe9, 0x25, 0x58, 0x1b, 0xc8, 0x1d, + 0xa7, 0x7c, 0xb8, 0x09, 0xef, 0x5f, 0x24, 0x5a, 0xa9, 0xc2, + 0x7c, 0x41, 0x52, 0x26, 0x07, 0xa3, 0x22, 0x3b, 0x55, 0xac, + 0x1b, 0x4f, 0xc9, 0xf8, 0x92, 0xdb, 0xf5, 0xc8, 0x2c, 0x56, + 0x56, 0x94, 0x49, 0x68, 0xc1, 0xf7, 0x03, 0x1d, 0x14, 0x4b, + 0x24, 0xd7, 0xb0, 0x02, 0x28, 0x24, 0x23, 0x70, 0x83, 0x59, + 0x14, 0x49, 0x2f, 0xcf, 0xb7, 0x13, 0xf0, 0xac, 0x7b, 0x4f, + 0xcb, 0x65, 0xbb, 0x3a, 0xf8, 0x25, 0x4c, 0x22, 0x6c, 0xa1, + 0x46, 0xd3, 0x61, 0xdb, 0xf6, 0x42, 0xcd, 0x5e, 0x96, 0x6d, + 0x2a, 0xb1, 0xdd, 0x01, 0x43, 0xce, 0xf7, 0x2c, 0x4e, 0x37, + 0x50, 0xcb, 0xe9, 0x5d, 0x80, 0x7a, 0x4f, 0x16, 0x12, 0x7f, + 0xe6, 
0x9c, 0x31, 0xe7, 0x1e, 0xff, 0xbc, 0x52, 0x6d, 0x11, + 0xf2, 0xae, 0x59, 0x96, 0x55, 0x8a, 0x2e, 0xa2, 0x98, 0x4a, + 0xe4, 0x24, 0x89, 0x55, 0xb0, 0x67, 0x21, 0x73, 0x12, 0x47, + 0x42, 0x66, 0x01, 0xca, 0x66, 0xa2, 0x2c, 0x07, 0x2b, 0xc5, + 0x5e, 0x5c, 0x8a, 0xca, 0xdf, 0x2e, 0x02, 0xaf, 0x43, 0xcd, + 0xb0, 0x83, 0x8b, 0xdb, 0x51, 0x75, 0xf6, 0xca, 0x43, 0x63, + 0x3a, 0x93, 0x40, 0x02, 0x52, 0x60, 0x7c, 0x50, 0x37, 0x79, + 0x0a, 0xaa, 0x0c, 0x20, 0x4e, 0xde, 0xd0, 0x6a, 0xda, 0xfe, + 0xef, 0xb6, 0x88, 0x87, 0x01, 0xb5, 0x4c, 0xda, 0x94, 0x6c, + 0xaa, 0xc6, 0x58, 0x8b, 0x16, 0xc7, 0x7d, 0x6e, 0x6d, 0x0a, + 0x79, 0x76, 0x01, 0x07, 0xb9, 0x85, 0x0c, 0xaf, 0xa4, 0xc2, + 0x4a, 0xeb, 0xbd, 0x78, 0xc1, 0xc3, 0xae, 0x37, 0x08, 0x0f, + 0x61, 0x93, 0x4e, 0xd1, 0x1e, 0x79, 0xd7, 0x82, 0xf1, 0x82, + 0xd7, 0x86, 0xfc, 0x12, 0xc1, 0x02, 0x79, 0x47, 0xd1, 0x87, + 0x3d, 0xdf, 0x4c, 0xae, 0xa4, 0xc2, 0x4e, 0xef, 0xfc, 0x78, + 0xc3, 0xeb, 0xaf, 0x37, 0x00, 0x0f, 0x61, 0x93, 0x4e, 0xd1, + 0x1e, 0x79, 0xb3, 0xc2, 0xf1, 0x82, 0xd3, 0x86, 0xfc, 0x12, + 0xc1, 0x02, 0x79, 0x66, 0x51, 0x87, 0x3d, 0x9f, 0xb3, 0x51, + 0x5b, 0x3d, 0xb1, 0x10, 0x03, 0x87, 0x3c, 0x14, 0x50, 0xc8, + 0xff, 0xf0, 0x9e, 0x6c, 0xb1, 0x2e, 0xe1, 0x86, 0x4c, 0x3d, + 0x0e, 0x7d, 0x2c, 0x79, 0x03, 0xed, 0x3e, 0xfd, 0x86, 0x99, + 0xae, 0x78, 0xc2, 0x60, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 
0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x39, 0x04, 0xdf, 0x14, 0x5d, 0x57, 0x3d, 0xb9, + 0x1c, 0xc3, 0x1b, 0x0a, 0x74, 0xb8, 0x81, 0x09, 0x55, 0x55, + 0xb3, 0x9d, 0x0d, 0x7a, 0x83, 0xa8, 0x48, 0x47, 0x7a, 0x3d, + 0xff, 0xd1, 0xaf, 0xe1, 0xec, 0x2b, 0xe7, 0x67, 0xed, 0xe9, + 0xef, 0x2b, 0xdf, 0x10, 0x57, 0x78, 0x04, 0x1e, 0x3e, 0xcf, + 0x50, 0xe1, 0x5b, 0x41, 0xe2, 0x52, 0x56, 0x24, 0xac, 0x0f, + 0x43, 0x6d, 0x6b, 0x5e, 0x23, 0x1e, 0x96, 0xb2, 0x9e, 0x1a, + 0xd6, 0xa2, 0xfb, 0x90, 0x79, 0x04, 0x6d, 0xc4, 0x80, 0xc3, + 0x6d, 0x1f, 0x16, 0xee, 0x1c, 0x59, 0xf2, 0x6f, 0x7c, 0x15, + 0xbb, 0x4d, 0x5d, 0x24, 0x92, 0xfd, 0x4f, 0x3f, 0x05, 0xc2, + 0x9e, 0x5b, 0x78, 0x08, 0x84, 0x9a, 0x46, 0x82, 0xf4, 0x41, + 0x9a, 0x79, 0x68, 0x08, 0x61, 0x1e, 0x95, 0x67, 0x8d, 0x77, + 0xc9, 0x33, 0xbf, 0x7b, 0xa7, 0x00, 0x34, 0xf8, 0x26, 0x0b, + 0x66, 0x59, 0xf7, 0x75, 0xbd, 0x65, 0x79, 0xf1, 0x8b, 0xf1, + 0x18, 0x86, 0x04, 0x91, 0x54, 
0xa0, 0xfa, 0x30, 0xaf, 0x3a, + 0xce, 0xad, 0x0c, 0xc3, 0x3d, 0x11, 0x01, 0x12, 0xa6, 0x9e, + 0x5b, 0xa8, 0xf0, 0x6e, 0x7f, 0x49, 0x25, 0x3c, 0x70, 0x87, + 0xb8, 0xae, 0x22, 0xff, 0xc3, 0xbf, 0xab, 0xf0, 0x3b, 0xb6, + 0x52, 0xc3, 0x8a, 0x62, 0xeb, 0x8a, 0x45, 0x88, 0xc6, 0xde, + 0x5f, 0x02, 0xc8, 0xe8, 0x97, 0x7c, 0x82, 0x38, 0x73, 0x60, + 0x4b, 0x40, 0x42, 0x38, 0xf0, 0x15, 0x3a, 0x26, 0x7a, 0xf5, + 0xad, 0xaf, 0xcb, 0x42, 0xf3, 0xb4, 0xd4, 0xc3, 0x9a, 0x62, + 0xeb, 0x8a, 0x45, 0x8c, 0x84, 0xde, 0x5f, 0x02, 0x48, 0xf8, + 0x97, 0x3e, 0x92, 0x38, 0x71, 0x60, 0x4b, 0x40, 0x06, 0x38, + 0xf0, 0x15, 0xb8, 0x26, 0x7a, 0xf5, 0xaf, 0xaf, 0xcb, 0x42, + 0xf3, 0xb4, 0x54, 0xc1, 0x65, 0x9d, 0x14, 0x75, 0xba, 0x73, + 0x7b, 0x21, 0xa0, 0xfd, 0xb7, 0x07, 0x68, 0xc1, 0x6d, 0xc7, + 0x8e, 0x9f, 0xb4, 0xbf, 0xf9, 0xc7, 0x0f, 0xea, 0x47, 0xd9, + 0x85, 0x0a, 0x50, 0x50, 0x34, 0xbd, 0x0c, 0x4b, 0xab, 0x3e, + 0xce, 0x72, 0x02, 0x24, 0x61, 0x4e, 0x7e, 0xfc, 0xd1, 0x99, + 0x90, 0x8e, 0xb5, 0x9d, 0x51, 0x96, 0xbb, 0x0b, 0x61, 0x9e, + 0xf3, 0xfa, 0x25, 0xb1, 0x9c, 0xdd, 0xb1, 0xb3, 0x09, 0xf4, + 0xe1, 0x35, 0xce, 0xa4, 0xa8, 0xf5, 0xd3, 0x3f, 0xb6, 0xad, + 0x39, 0x9b, 0x18, 0x35, 0x41, 0x6f, 0xb5, 0x3a, 0x72, 0x1b, + 0xac, 0x28, 0x6c, 0xeb, 0x2c, 0xc4, 0xdd, 0x1f, 0x90, 0xa6, + 0xbb, 0xef, 0xa3, 0x50, 0xbd, 0x26, 0xf0, 0xb5, 0x69, 0x45, + 0xd3, 0x1f, 0xff, 0x94, 0xb6, 0x12, 0x13, 0xfc, 0x50, 0x7f, + 0xff, 0x03, 0x41, 0x2b, 0x3b, 0x9f, 0x0a, 0x29, 0x9d, 0x82, + 0x7d, 0xc7, 0x92, 0x2f, 0x42, 0xd1, 0x7a, 0xbd, 0x32, 0x94, + 0x41, 0x64, 0xd1, 0xf3, 0x07, 0x9c, 0xd5, 0xb9, 0x33, 0xda, + 0x2f, 0x3e, 0xe0, 0x74, 0xb6, 0x7f, 0x82, 0x05, 0x02, 0x8d, + 0x57, 0x1e, 0x80, 0xd3, 0x02, 0xa3, 0x40, 0x87, 0x4d, 0x62, + 0xa6, 0xc6, 0x09, 0x7b, 0x10, 0xcd, 0x67, 0xec, 0x50, 0x17, + 0x21, 0xc8, 0xeb, 0x05, 0xc7, 0xb0, 0xc6, 0x60, 0x4a, 0xa2, + 0xaa, 0xbf, 0x4e, 0xfe, 0x9b, 0x2f, 0x83, 0x7c, 0xcc, 0x67, + 0xb3, 0xfa, 0x6b, 0x81, 0x5d, 0x92, 0x08, 0x2c, 0x0c, 0x5c, + 0x45, 0x91, 0x11, 0x91, 0x33, 0x1e, 0x2e, 
0xf8, 0xf5, 0x2c, + 0xb7, 0x98, 0x8e, 0xc0, 0x87, 0x97, 0x8c, 0x36, 0x43, 0x8e, + 0xaf, 0x2f, 0x13, 0x90, 0xed, 0x6c, 0x87, 0x71, 0x76, 0xa7, + 0x49, 0xc0, 0x22, 0x1a, 0x0d, 0x39, 0x5c, 0x81, 0xb3, 0x81, + 0x39, 0xb6, 0x2a, 0xb2, 0xa3, 0xa5, 0xb7, 0x98, 0x8e, 0xc0, + 0x83, 0x9e, 0xac, 0x37, 0x43, 0x8e, 0xaf, 0x2f, 0x12, 0x98, + 0xec, 0x6e, 0x87, 0x71, 0x76, 0xa7, 0x49, 0xc0, 0x22, 0x1a, + 0x0d, 0x39, 0x5c, 0x89, 0xb3, 0x91, 0x31, 0x96, 0x2a, 0xb0, + 0xa3, 0xa5, 0x48, 0x67, 0x71, 0x3f, 0x7c, 0x61, 0x53, 0xc8, + 0xbc, 0x71, 0x50, 0xd0, 0xec, 0x67, 0x13, 0x91, 0x78, 0x8e, + 0x89, 0x58, 0xb6, 0x3f, 0xdd, 0xe5, 0xf2, 0xc6, 0xa3, 0x76, + 0x4c, 0x6e, 0xce, 0x69, 0xd5, 0x4f, 0x5c, 0x5a, 0x05, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 
0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x16, 0xdf, 0x9b, 0x40, + 0x89, 0x8e, 0xcc, 0x66, 0x93, 0xa9, 0xbd, 0xc9, 0x51, 0xde, + 0x26, 0x23, 0x0e, 0x6b, 0x72, 0x28, 0x09, 0x13, 0xf0, 0x3a, + 0x26, 0x29, 0x3a, 0xc5, 0xf1, 0xd7, 0xf1, 0x1e, 0x68, 0x03, + 0xb1, 0xb1, 0x8e, 0xe3, 0xbf, 0x49, 0xad, 0xaf, 0xac, 0xe0, + 0x23, 0x27, 0x87, 0xe9, 0x5e, 0x0d, 0x21, 0xc6, 0xa6, 0xa0, + 0x07, 0xcd, 0x90, 0x2e, 0x90, 0x5b, 0x29, 0x0b, 0xcb, 0x15, + 0x0c, 0xf9, 0xab, 0xc8, 0xe0, 0x54, 0x48, 0x54, 0x34, 0x8b, + 0xa7, 0x9e, 0x52, 0xde, 0x11, 0x4b, 0x30, 0x43, 0xf3, 0x5e, + 0xdd, 0x08, 0x16, 0x48, 0x36, 0x71, 0xb8, 0x05, 0x3a, 0x14, + 0x2e, 0x5b, 0xcc, 0x2a, 0xd4, 0x88, 0x4e, 0x0f, 0x7e, 0xbc, + 0x0c, 0x48, 0x8a, 0x49, 0xbb, 0xbf, 0x92, 0xa8, 0xa7, 0x5c, + 0x3b, 0x19, 0x81, 0x9b, 0xac, 0xe8, 0x62, 0x5c, 0xa3, 0x4a, + 0x93, 0x0a, 0x74, 0xb9, 0x28, 0x91, 0x2b, 0xae, 0xf6, 0xb7, + 0x70, 0x96, 0x6e, 0xd1, 0xa2, 0x39, 0xd1, 0x5d, 0x8a, 0x42, + 0x1a, 0xe9, 0x55, 0x84, 0x90, 0xf4, 0xbd, 0x85, 0xcd, 0xcb, + 0xa7, 0xfc, 0x36, 0xbf, 0x80, 0xaa, 0x3a, 0x64, 0xd5, 0x6f, + 0xeb, 0x2c, 0x31, 0x07, 0xe4, 0xbb, 0x93, 0xcf, 0xc9, 0x89, + 0xd9, 0xda, 0xb3, 0x54, 0xc1, 0x97, 0x3a, 0x33, 0xdd, 0x2c, + 0x25, 0xd4, 0xfd, 0xf9, 0x91, 0xda, 0xee, 0xea, 0x2a, 0xde, + 0x54, 0x48, 0x31, 0xc4, 0xd9, 0xbb, 0x79, 0x2d, 0x36, 0xef, + 0x04, 0xa3, 0x02, 0x96, 0x48, 0x49, 0x88, 0xe8, 0x53, 0x65, + 0xe3, 0xe2, 0x3b, 0x23, 0xdd, 0x0c, 0x25, 0xd4, 0xfd, 0xb9, + 0x81, 0xca, 0xee, 0xea, 0x2a, 0xfe, 0x44, 0x48, 0x31, 0x44, + 0xd9, 0xab, 0x69, 0x2d, 0x36, 0xef, 0x04, 0xa3, 0x02, 0x96, + 0x48, 
0x49, 0x98, 0xf8, 0x53, 0x75, 0xc3, 0xc2, 0xc4, 0xdc, + 0x22, 0xf3, 0xda, 0x2b, 0x02, 0x46, 0x7e, 0x35, 0x11, 0x15, + 0xd5, 0x01, 0xbb, 0xb7, 0xce, 0xbb, 0x26, 0x54, 0x96, 0xd2, + 0xc9, 0x10, 0xfb, 0x5c, 0xfd, 0x69, 0xb7, 0xb6, 0x67, 0x07, + 0xac, 0x8a, 0x3c, 0x3d, 0xb8, 0x6d, 0x43, 0x90, 0x58, 0xb3, + 0x49, 0xe2, 0x24, 0x01, 0x45, 0x8b, 0x83, 0x6a, 0x73, 0x6b, + 0xff, 0x8c, 0xec, 0x11, 0x8b, 0x61, 0xb8, 0x95, 0x7c, 0xe9, + 0x20, 0x71, 0xa6, 0x3f, 0x20, 0xd0, 0xb8, 0xa9, 0xd3, 0xf3, + 0x97, 0xf1, 0x5d, 0x0f, 0xcd, 0xc2, 0x4b, 0x15, 0x11, 0x7e, + 0xca, 0x46, 0xf0, 0xea, 0x90, 0x7d, 0x93, 0x28, 0xb2, 0x0f, + 0x6f, 0x63, 0x79, 0x74, 0x3a, 0x3b, 0x07, 0x73, 0x6a, 0x77, + 0xba, 0x1c, 0x16, 0xca, 0x3f, 0xde, 0xd9, 0x97, 0x8d, 0x7c, + 0x1b, 0xbf, 0xb6, 0x77, 0x21, 0x17, 0x62, 0xc5, 0x8b, 0xce, + 0xbc, 0xc6, 0x09, 0xd1, 0xae, 0x1e, 0xd1, 0x5b, 0x2d, 0x7d, + 0xa1, 0xad, 0xb5, 0x14, 0x18, 0xef, 0x86, 0xb2, 0xbe, 0x99, + 0x46, 0x5f, 0x39, 0x95, 0xfa, 0x67, 0xe6, 0x08, 0xcd, 0xf4, + 0xa3, 0x5b, 0x58, 0xa4, 0x3c, 0x71, 0x63, 0xf1, 0x36, 0xcc, + 0xaa, 0x5f, 0xda, 0xe4, 0x53, 0x4f, 0x34, 0x15, 0x78, 0xd0, + 0xc4, 0xb4, 0x60, 0x91, 0xbb, 0x29, 0x03, 0xb9, 0xf3, 0xf7, + 0xc1, 0x96, 0x60, 0xbb, 0xa0, 0x4a, 0xb3, 0xb8, 0x7f, 0x07, + 0x30, 0xa8, 0xfb, 0x7f, 0x1d, 0xcb, 0x70, 0x61, 0x52, 0x75, + 0x46, 0xb1, 0x2d, 0x47, 0x3d, 0x2f, 0x8a, 0x03, 0x96, 0x1e, + 0x2e, 0x13, 0x7b, 0x03, 0xf7, 0xd5, 0xc9, 0xb0, 0x6c, 0xaa, + 0x20, 0xc6, 0x86, 0xe2, 0xfe, 0x50, 0xf2, 0x2b, 0x6e, 0xaf, + 0x9b, 0xfb, 0xda, 0x41, 0x17, 0xd3, 0x5a, 0x37, 0x30, 0x46, + 0xac, 0x70, 0x9a, 0x7b, 0x00, 0x1e, 0x3b, 0x93, 0x77, 0x0e, + 0xf3, 0xd5, 0xc9, 0xb0, 0x6c, 0xba, 0x20, 0x46, 0x83, 0xaa, + 0xf6, 0x00, 0xf2, 0x2b, 0x6a, 0xaf, 0x9f, 0xcb, 0xda, 0x41, + 0x17, 0xd3, 0x5a, 0x37, 0x34, 0x46, 0xac, 0x70, 0x9a, 0x73, + 0x00, 0x1e, 0x3b, 0x93, 0x77, 0x0b, 0x0c, 0x2a, 0x36, 0x4f, + 0x93, 0x45, 0xdf, 0xb9, 0x7c, 0x55, 0x09, 0xff, 0x0d, 0xd4, + 0x95, 0x50, 0x60, 0x34, 0x25, 0xbe, 0xe8, 0x2c, 0xa5, 0xc8, + 0xcb, 0xb9, 0x53, 
0x8f, 0x65, 0x8c, 0xff, 0xe1, 0xc4, 0x6c, + 0x88, 0xf4, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0a, 0xa0, 0x1d, 0x4e, 0xe6, 0xac, 0xf3, 0xf3, 0xb4, 0xe8, + 0x06, 0x13, 0x46, 0x9b, 0x76, 0xee, 0x9a, 0x6c, 0x83, 0x07, + 0x09, 0xd8, 0xd2, 0x20, 0x34, 
0xcd, 0xb8, 0x6b, 0x07, 0x94, + 0x8b, 0x1c, 0x02, 0x95, 0xb5, 0xb7, 0x20, 0x48, 0x98, 0xfe, + 0x48, 0xbf, 0x08, 0xcb, 0x1e, 0xbf, 0x1a, 0x00, 0x72, 0x8e, + 0xd9, 0x6f, 0x47, 0xe5, 0xa9, 0x37, 0x64, 0xc4, 0xbf, 0xa4, + 0x52, 0x17, 0x27, 0x45, 0x40, 0x43, 0x21, 0xb6, 0x3a, 0xe0, + 0xfa, 0x06, 0x9b, 0x43, 0x09, 0x6c, 0x22, 0x64, 0x42, 0xde, + 0x3f, 0xdc, 0x6c, 0xd0, 0xcd, 0xac, 0x5d, 0xe7, 0x01, 0x69, + 0x01, 0x8f, 0x69, 0x02, 0xdb, 0xa9, 0xc9, 0x93, 0xd7, 0x71, + 0x8a, 0x5c, 0xf1, 0x25, 0x3d, 0x34, 0x80, 0x59, 0xeb, 0x54, + 0x01, 0x59, 0x52, 0x54, 0x3f, 0x70, 0xa6, 0xd5, 0x56, 0x06, + 0x96, 0x0e, 0x1b, 0xb1, 0xc1, 0xe6, 0x95, 0xc5, 0x6e, 0x51, + 0xe1, 0x08, 0x77, 0x72, 0x5e, 0xbf, 0xaf, 0xdc, 0x84, 0x51, + 0xe1, 0x4c, 0x16, 0xac, 0xcd, 0x34, 0x63, 0x94, 0xc3, 0x56, + 0x6b, 0x69, 0x6b, 0x92, 0x7e, 0xa2, 0x96, 0x28, 0xea, 0xa6, + 0x21, 0x85, 0x14, 0xee, 0xe9, 0xa3, 0xc5, 0x94, 0x39, 0x60, + 0x42, 0xa9, 0xfd, 0xb8, 0x88, 0x45, 0x57, 0x37, 0xad, 0x60, + 0xcc, 0x7e, 0x03, 0xa4, 0x61, 0x16, 0x43, 0x2e, 0xf9, 0x99, + 0x7e, 0xaa, 0xd3, 0xad, 0xd8, 0xa5, 0x44, 0x83, 0x15, 0xd7, + 0xad, 0x85, 0xf7, 0x02, 0x3a, 0x83, 0x96, 0xa3, 0x8c, 0xde, + 0x98, 0x35, 0x6d, 0x2c, 0x0d, 0x25, 0xcc, 0x7e, 0x03, 0xa4, + 0x61, 0x16, 0x4b, 0x2a, 0xfb, 0x99, 0x7e, 0xa2, 0xd7, 0xad, + 0xd8, 0xa5, 0x41, 0x83, 0x15, 0xc7, 0xad, 0x85, 0xf7, 0x02, + 0x3a, 0x83, 0x96, 0xab, 0x8c, 0xde, 0x98, 0x75, 0x6d, 0x2c, + 0x0d, 0x25, 0x33, 0x81, 0xfc, 0x5b, 0x9e, 0xe9, 0xb4, 0xd5, + 0x04, 0x66, 0x81, 0x5d, 0x28, 0x52, 0x27, 0x5a, 0xbe, 0x7c, + 0xea, 0x38, 0x52, 0x7a, 0x08, 0xfd, 0xc5, 0x7c, 0x69, 0x54, + 0x73, 0x21, 0x67, 0x8a, 0x92, 0xd3, 0xf2, 0xda, 0xbd, 0x2e, + 0x77, 0x1e, 0x9e, 0x27, 0x73, 0xb7, 0x0d, 0xb4, 0xcb, 0x16, + 0x65, 0x21, 0x94, 0xed, 0xa8, 0x68, 0x2f, 0xa9, 0x0d, 0x6e, + 0x15, 0x96, 0x9d, 0xdd, 0x42, 0xba, 0xae, 0x1a, 0x87, 0x6a, + 0x1d, 0x5e, 0x58, 0xb9, 0x6d, 0x59, 0xc0, 0x0e, 0xb6, 0x5c, + 0x6b, 0x5e, 0xab, 0xc3, 0x99, 0xf5, 0xd2, 0x13, 0xa2, 0xf7, + 0x71, 0x1b, 0x54, 0x51, 0x88, 0xe7, 0x2a, 
0xab, 0xa6, 0x87, + 0x82, 0xf8, 0x80, 0xe6, 0xc7, 0x0d, 0x10, 0x9f, 0xc4, 0x94, + 0xfe, 0xd6, 0x92, 0x91, 0x1e, 0x22, 0x92, 0x2c, 0x80, 0xf2, + 0xf7, 0x22, 0x55, 0x89, 0x62, 0xd5, 0xcc, 0x31, 0x68, 0xbc, + 0x4a, 0x37, 0xc2, 0xd4, 0x90, 0xe1, 0xb4, 0x33, 0x8e, 0xd8, + 0x1e, 0x02, 0x42, 0x8a, 0xa3, 0x1d, 0x08, 0x0b, 0x46, 0x78, + 0xa9, 0x6c, 0x6d, 0x67, 0x4e, 0x18, 0x36, 0x67, 0x54, 0xf4, + 0x3b, 0x2a, 0xf6, 0xf4, 0xd7, 0x6f, 0x49, 0xbb, 0xfa, 0xfa, + 0x22, 0xb3, 0x06, 0x05, 0x50, 0xe8, 0x62, 0xc6, 0x4e, 0x50, + 0x33, 0x97, 0x38, 0x8d, 0xaa, 0x40, 0xad, 0x7d, 0x70, 0xe0, + 0x24, 0x37, 0x34, 0xfa, 0x28, 0x2c, 0xa7, 0x0b, 0xfb, 0x34, + 0x42, 0xb2, 0x89, 0x6a, 0xb7, 0x82, 0x87, 0x5d, 0x7b, 0x93, + 0x8f, 0xbd, 0xf5, 0x04, 0x15, 0xa9, 0xfb, 0x70, 0x82, 0x85, + 0x9b, 0xdc, 0xbd, 0x75, 0xff, 0xf2, 0x03, 0xf3, 0x76, 0xf6, + 0x69, 0x2c, 0x10, 0x4a, 0xc6, 0xf4, 0x64, 0x3c, 0x0b, 0x33, + 0x37, 0x42, 0xd7, 0x39, 0xfe, 0x92, 0x1d, 0xbf, 0x66, 0x8c, + 0xd5, 0x8b, 0x03, 0x76, 0x02, 0x85, 0x8a, 0x58, 0xbd, 0x75, + 0xfd, 0xe2, 0x07, 0xf3, 0x36, 0xf6, 0x69, 0x2c, 0x13, 0x4a, + 0xc6, 0xb4, 0x66, 0x3c, 0x0b, 0x33, 0x37, 0xc2, 0xd7, 0x39, + 0x7a, 0x92, 0x1f, 0xbf, 0x66, 0x8c, 0xd5, 0x8b, 0x23, 0x76, + 0xfd, 0x7a, 0x75, 0xa7, 0x42, 0x8a, 0x02, 0x1d, 0xf8, 0x0c, + 0xc9, 0x09, 0x96, 0xd3, 0xec, 0xb5, 0x39, 0x4b, 0x99, 0xc3, + 0xf4, 0xcc, 0xc8, 0x3d, 0x28, 0xc6, 0x85, 0x6d, 0xe0, 0x40, + 0x99, 0x73, 0x2a, 0x74, 0xdc, 0x89, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 
0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xe6, 0x0d, 0x2b, 0x9a, 0x9b, 0xc4, + 0xaf, 0xd4, 0x6b, 0x51, 0xfb, 0x3e, 0x14, 0x46, 0xe3, 0xa2, + 0xc3, 0xc8, 0x07, 0xb3, 0xd5, 0x54, 0xc4, 0x0a, 0xe0, 0xf5, + 0x9c, 0x3c, 0xec, 0xd0, 0x4d, 0x38, 0x79, 0x6a, 0xdf, 0xd3, + 0x83, 0xc6, 0xbe, 0x9c, 0xb5, 0xc0, 0xe0, 0x79, 0xc1, 0x85, + 0xae, 0x15, 0xe7, 0x14, 0x57, 0x7f, 0x43, 0x67, 0xc8, 0x40, + 0x7f, 0xb5, 0xed, 0xa4, 0xd3, 0x44, 0x59, 0x84, 0xef, 0xfc, + 0xf1, 0x40, 0x39, 0x7b, 0xd9, 0x17, 0x6e, 0x12, 0xbf, 0x77, + 0x61, 0xd1, 0x23, 0x22, 0x36, 0x29, 0x59, 0x45, 0x98, 0x18, + 0xc1, 0x0b, 0xad, 0x22, 0x80, 0xf6, 0x46, 0x54, 0xf2, 0x58, + 0x98, 0x8b, 0xe5, 0xe3, 0x9c, 0x65, 0x23, 0x6c, 0x27, 0x89, + 0xcb, 0xa5, 0x87, 0x94, 0x78, 0x8d, 0x1b, 0xc3, 0x2a, 0x9c, + 0xa3, 0xd3, 0x78, 0x32, 0xe1, 0xd1, 0x6d, 0x66, 0x1a, 0x5b, + 0xc6, 
0xbb, 0x22, 0xa0, 0xbd, 0x40, 0xb4, 0x36, 0x93, 0x61, + 0x86, 0x11, 0x7c, 0x11, 0xac, 0xd1, 0x60, 0xec, 0x8d, 0x06, + 0xb6, 0x95, 0x7e, 0x78, 0x98, 0x95, 0x14, 0xbe, 0x77, 0x16, + 0xf9, 0x64, 0xcd, 0xd8, 0x1d, 0x8a, 0x75, 0x94, 0x69, 0x2e, + 0x75, 0x49, 0xa6, 0xe9, 0xb9, 0x8b, 0x84, 0xa1, 0xb4, 0x1a, + 0xd7, 0x68, 0xa0, 0xf4, 0x9f, 0x75, 0x30, 0x95, 0xb9, 0x4d, + 0x9a, 0xd7, 0x36, 0x92, 0x76, 0x76, 0xb6, 0x50, 0xe8, 0x4e, + 0x17, 0x1a, 0x40, 0xfd, 0x69, 0x7a, 0xef, 0x6a, 0xba, 0x66, + 0xf5, 0xc9, 0x14, 0xd5, 0xfc, 0x52, 0x0f, 0x29, 0xc5, 0xf4, + 0x8f, 0x55, 0x30, 0x95, 0xb9, 0x4d, 0x9a, 0xd5, 0x36, 0x92, + 0x76, 0x76, 0xb6, 0x50, 0xc9, 0x4e, 0x17, 0x1a, 0x44, 0xbd, + 0x69, 0x78, 0xef, 0x6b, 0xba, 0x66, 0xf5, 0xc9, 0x84, 0xd5, + 0xfc, 0x52, 0x07, 0x29, 0xc5, 0xf4, 0x70, 0xaa, 0xcf, 0x6a, + 0x46, 0xb2, 0x65, 0x2a, 0xc9, 0x6d, 0x89, 0x89, 0x49, 0xaf, + 0x36, 0xb1, 0xe8, 0xe5, 0xbb, 0x42, 0x96, 0x87, 0x10, 0x94, + 0x45, 0x99, 0x0a, 0x36, 0x7b, 0x2a, 0x03, 0xad, 0xf8, 0xd6, + 0x3a, 0x0b, 0x0f, 0x22, 0x47, 0x5f, 0x03, 0x9b, 0x47, 0xe0, + 0x43, 0x0a, 0x79, 0x2a, 0x0a, 0xf2, 0xa5, 0x57, 0xcf, 0x22, + 0x3f, 0x66, 0x7d, 0x8e, 0x5c, 0x5a, 0xd2, 0x27, 0x86, 0x41, + 0x21, 0x57, 0x9d, 0xed, 0xdc, 0xe4, 0x1a, 0xfd, 0x53, 0x00, + 0x25, 0x92, 0x34, 0x4c, 0x02, 0xd9, 0x43, 0x12, 0x39, 0x07, + 0x3f, 0x8c, 0xd6, 0x65, 0xe0, 0xed, 0xf3, 0xae, 0x6b, 0x7c, + 0x7e, 0xb3, 0x17, 0x8d, 0xce, 0xc7, 0xc8, 0x69, 0x3f, 0x94, + 0x20, 0x21, 0xdf, 0xee, 0x37, 0x57, 0x27, 0x50, 0x30, 0x8a, + 0x21, 0x06, 0x49, 0xbd, 0xbc, 0x9e, 0x0b, 0xb5, 0x0e, 0x37, + 0xda, 0x4d, 0x41, 0xa0, 0xfd, 0xbe, 0xf2, 0x48, 0x70, 0x55, + 0x49, 0x31, 0xb8, 0x74, 0xda, 0x09, 0x67, 0xbb, 0x1c, 0xd8, + 0x51, 0xfb, 0xf8, 0xdd, 0xef, 0x02, 0xb2, 0x84, 0x3e, 0xcb, + 0x6c, 0x51, 0x03, 0xd2, 0xfd, 0xde, 0x48, 0x37, 0x39, 0x2c, + 0xff, 0x1a, 0x94, 0x15, 0x66, 0x3d, 0x99, 0xcb, 0x7b, 0xcd, + 0xfb, 0x77, 0x15, 0xcc, 0x5d, 0x81, 0xc8, 0x28, 0x65, 0x78, + 0x6d, 0xc0, 0x35, 0xa0, 0xd8, 0xaa, 0xea, 0xa0, 0xf4, 0xa6, + 0x85, 0xed, 0x44, 
0x8b, 0x5a, 0x08, 0xf6, 0x13, 0x8c, 0x47, + 0xbc, 0x9f, 0xdb, 0x24, 0x25, 0x25, 0xe7, 0x67, 0xda, 0x22, + 0x0a, 0xb7, 0xc2, 0x07, 0xed, 0x49, 0x69, 0x00, 0x25, 0x21, + 0xeb, 0x8e, 0x64, 0x30, 0x6d, 0xc6, 0xb5, 0xf7, 0x75, 0x83, + 0x96, 0xa9, 0xd3, 0x6b, 0x96, 0x37, 0x5c, 0x3f, 0x7b, 0x2a, + 0x68, 0x64, 0x07, 0xec, 0xdd, 0x22, 0x1d, 0xe5, 0x42, 0x2b, + 0xed, 0x49, 0x69, 0x00, 0x35, 0x20, 0xeb, 0x8e, 0x6c, 0x20, + 0x6d, 0xc6, 0xb5, 0xe7, 0x75, 0x03, 0x12, 0xa8, 0xf7, 0x6b, + 0x86, 0x07, 0x5c, 0x3f, 0x7b, 0x2a, 0x68, 0x64, 0x27, 0xee, + 0xdd, 0x22, 0x1d, 0xe5, 0xbd, 0xd4, 0x12, 0xb6, 0x96, 0xff, + 0xca, 0xdf, 0x14, 0x71, 0x93, 0xdf, 0x92, 0x39, 0x4a, 0x18, + 0x8a, 0xfc, 0xed, 0x57, 0x08, 0x94, 0x79, 0xf8, 0xa3, 0xc0, + 0x84, 0xd5, 0x97, 0x9b, 0xd8, 0x11, 0x22, 0xdd, 0xe2, 0x1a, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x37, 0x30, + 0x87, 0xb8, 0xd1, 0x6b, 0x60, 0x87, 0x74, 0x62, 0x3a, 0xcd, + 0x13, 0x86, 0xf1, 0x2d, 0x3e, 0xb6, 0x4c, 0x04, 0x70, 0xd9, + 0xbc, 0xc0, 0xd7, 0x77, 0x23, 0xca, 0x07, 0x13, 0x84, 0xe6, + 0xd1, 0xe1, 0x7a, 0x7a, 0xc3, 0xa7, 0x72, 0xc1, 0x9a, 0x72, + 0x48, 0xb3, 0x4d, 0x66, 0xfd, 0xdd, 0x81, 0x56, 0xc8, 0x6f, + 0x47, 0xf5, 0x76, 0x9c, 0xd0, 0xe5, 0x60, 0x12, 0x5d, 0x90, + 0xab, 0xbf, 0xb8, 0x32, 0x2e, 0xc0, 0x97, 0xa7, 0x1f, 0x94, + 0x95, 0xe2, 0x05, 0x00, 0x80, 0x35, 0x03, 0xf8, 0x2e, 0x5f, + 0xb1, 0xde, 0xa7, 0xbf, 0x81, 0xa9, 0xe6, 0xe4, 0x23, 0xe7, + 0x88, 0xac, 0x87, 0x05, 0x33, 0x8c, 0x9b, 0xaa, 0x29, 0x5a, + 0x92, 0xb4, 0x59, 0x8c, 0x94, 0xbe, 0xe7, 0x13, 0x82, 0xef, + 0x8f, 0x87, 0x0d, 0x92, 0xbb, 0xa8, 0xdb, 0x22, 0x0f, 0x92, + 0x19, 0x50, 0xaa, 0xfd, 0x02, 0x04, 0x0b, 0x56, 0x15, 0xd5, + 0x7d, 0x22, 0xa6, 0x70, 0x30, 0xf8, 0x83, 0x31, 0x36, 0x37, + 0xaa, 0x13, 0xb0, 0xf7, 0xbb, 0xad, 0xa8, 0x84, 0x18, 0x40, + 0xad, 0xe7, 0x83, 0x34, 0xec, 0x97, 0x5d, 0x51, 0x49, 0x85, + 0xc9, 0x75, 0xc1, 0xda, 0x49, 0x48, 0xd6, 0x84, 0xb6, 0x7c, + 0x9b, 0x28, 0xae, 0xe4, 0x51, 0xdf, 0x98, 0xec, 0x84, 0xf3, + 0x9a, 0xc4, 0x8a, 0xc0, 0xfa, 0xd8, 0x89, 0xf7, 0x83, 0x26, + 0xca, 0xa7, 0xdf, 0xd5, 0x4e, 0xa6, 0x92, 0x35, 0xc5, 0x72, + 0x25, 0x6e, 0xc2, 0x04, 0xf6, 0x6c, 0x3e, 0x59, 0x89, 0x8c, + 0x5b, 0x5c, 0xb0, 0x97, 0x84, 0xf7, 0xba, 0xc5, 0x8a, 0x80, + 0x3a, 0xda, 0x89, 0xf3, 0x83, 0x26, 0xca, 
0x87, 0xdd, 0xd5, + 0x4e, 0xa4, 0x92, 0x75, 0xc5, 0x72, 0x25, 0x4e, 0xc2, 0x04, + 0xf6, 0x6c, 0x3a, 0x59, 0x89, 0xa4, 0x5b, 0x5e, 0xb0, 0x97, + 0x7b, 0x08, 0x45, 0x3a, 0x75, 0x7f, 0xc5, 0x25, 0x76, 0x0c, + 0x7c, 0xd9, 0x35, 0x78, 0x22, 0x2a, 0xb1, 0x5b, 0x6d, 0x8a, + 0x3a, 0x8d, 0xda, 0xb1, 0x3d, 0xfb, 0x09, 0x93, 0xc5, 0xa6, + 0x76, 0x5b, 0xa4, 0xa1, 0x4f, 0x68, 0x9f, 0x12, 0x29, 0xa2, + 0xd0, 0x4a, 0x7a, 0xc4, 0xdb, 0x8f, 0xbc, 0x33, 0x80, 0x1b, + 0x55, 0xb1, 0x91, 0xf3, 0x20, 0xbd, 0x25, 0x9e, 0x26, 0x00, + 0x84, 0xf7, 0x17, 0x6a, 0xbc, 0xda, 0x4a, 0xc8, 0x9b, 0x14, + 0xf6, 0xc5, 0xe2, 0xff, 0xd2, 0x57, 0x2d, 0xd8, 0xd0, 0x45, + 0x87, 0xee, 0xfc, 0x05, 0xa7, 0x7b, 0x64, 0x52, 0x7c, 0xde, + 0xfb, 0x86, 0x11, 0x25, 0xb2, 0x09, 0x4c, 0xa7, 0x35, 0x2d, + 0x67, 0xa4, 0xe6, 0xd0, 0xad, 0x6c, 0xeb, 0xd0, 0x8e, 0x86, + 0x1b, 0xf4, 0x5f, 0x02, 0xd5, 0x8c, 0xc1, 0xf5, 0x48, 0x63, + 0x07, 0x70, 0x68, 0x78, 0x16, 0xca, 0x2c, 0x51, 0xf6, 0x95, + 0xc4, 0x20, 0x0f, 0xa6, 0x13, 0x7b, 0xf9, 0x81, 0x10, 0xd1, + 0x86, 0x66, 0x92, 0xd9, 0x95, 0x64, 0xfc, 0x9e, 0xfa, 0x2e, + 0x71, 0x1c, 0x00, 0xd4, 0x86, 0x91, 0xfd, 0xf7, 0x4b, 0x24, + 0x97, 0x8f, 0x6c, 0x0d, 0x5a, 0x48, 0xc7, 0xc0, 0xa8, 0x4e, + 0xaf, 0xf9, 0x50, 0xa4, 0xa3, 0x48, 0x90, 0xb6, 0x71, 0x41, + 0xb5, 0xea, 0x77, 0x50, 0x72, 0x66, 0x7c, 0xfb, 0xcb, 0x82, + 0x71, 0x8a, 0x01, 0xcb, 0xcf, 0x6a, 0x4b, 0x16, 0x33, 0x50, + 0x27, 0xf8, 0xde, 0xbc, 0xdc, 0xb1, 0x2a, 0xe9, 0x3a, 0xdb, + 0x7a, 0xe9, 0xac, 0x90, 0x24, 0x18, 0x18, 0xb7, 0x37, 0x14, + 0xe6, 0x5c, 0xec, 0xec, 0x9a, 0x5e, 0xa1, 0xa3, 0x01, 0x9b, + 0x4f, 0x7a, 0x5f, 0xdc, 0x2b, 0x78, 0xe0, 0xd8, 0x0e, 0xb1, + 0xd8, 0xa7, 0xea, 0xe9, 0xab, 0x92, 0x22, 0xeb, 0x95, 0xdc, + 0x26, 0x19, 0x9c, 0xb7, 0x37, 0x54, 0xf6, 0x5c, 0xec, 0xec, + 0x8a, 0x16, 0xe1, 0x83, 0x01, 0xdb, 0x4f, 0x7a, 0x5f, 0x9e, + 0x2b, 0x58, 0x60, 0xd8, 0x8e, 0xb1, 0xd8, 0xa7, 0x6a, 0xe9, + 0x2b, 0x92, 0x22, 0xeb, 0x94, 0xdc, 0x26, 0x08, 0x63, 0x48, + 0xc8, 0xab, 0x09, 0xa3, 0x13, 0x13, 0x75, 0xe9, 0x1e, 
0x7c, + 0xfe, 0x24, 0xb0, 0x85, 0xa0, 0x61, 0xd4, 0xa7, 0x9f, 0x27, + 0x71, 0x4e, 0x27, 0x58, 0x95, 0x16, 0xd4, 0x6d, 0xdd, 0x14, + 0x6b, 0x23, 0xd9, 0xf7, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0xac, 0xab, 0x1b, 0x4d, 0x4b, 0x09, 0x00, 0x7c, + 0x35, 
0x9c, 0x96, 0x3f, 0x7b, 0x0a, 0x7b, 0x5b, 0xbe, 0x6b, + 0x2b, 0x18, 0x85, 0x78, 0x28, 0x62, 0x3d, 0x42, 0x2c, 0x48, + 0x22, 0x69, 0x4b, 0xaf, 0xa9, 0xa4, 0x1d, 0xb5, 0x60, 0xeb, + 0x29, 0x4f, 0xc6, 0x73, 0x44, 0x78, 0x46, 0x62, 0xeb, 0x40, + 0xfe, 0xf7, 0x05, 0x44, 0xe6, 0xc3, 0xf6, 0x68, 0x70, 0x6d, + 0x7a, 0x41, 0xc8, 0x18, 0x6f, 0xc1, 0x5e, 0x4b, 0x46, 0xad, + 0x55, 0x4e, 0xd1, 0x40, 0x1e, 0x80, 0x5f, 0x0d, 0xbf, 0xe5, + 0x4e, 0x79, 0x10, 0x68, 0x39, 0xfb, 0x27, 0x1b, 0x25, 0x25, + 0x99, 0xd3, 0x26, 0xed, 0x83, 0xfd, 0x75, 0x94, 0xf9, 0xe3, + 0xca, 0x09, 0xa7, 0xd9, 0xa6, 0xc6, 0x9f, 0x26, 0xc9, 0x0a, + 0xe5, 0x42, 0xea, 0x88, 0x85, 0xf8, 0xbf, 0xc6, 0xcb, 0xb5, + 0x05, 0x1b, 0xfb, 0x89, 0xef, 0x1c, 0xd6, 0x91, 0x62, 0xef, + 0x26, 0x52, 0x13, 0x3c, 0x94, 0x8b, 0xd9, 0x93, 0x37, 0x16, + 0xb0, 0xa9, 0x17, 0x89, 0x95, 0xa4, 0xc9, 0xa6, 0x44, 0x9f, + 0x23, 0x54, 0xa7, 0x66, 0x21, 0x79, 0x66, 0x39, 0x2a, 0x95, + 0x25, 0xb1, 0x6d, 0x8c, 0x3f, 0x1e, 0x90, 0x94, 0x2f, 0xe1, + 0x5a, 0x40, 0xb5, 0x18, 0x67, 0x51, 0x4a, 0xec, 0xf0, 0xae, + 0xc3, 0xb6, 0xcc, 0xcc, 0xc8, 0x9e, 0xe1, 0x56, 0x8f, 0xf6, + 0x73, 0xbb, 0x70, 0x6b, 0x68, 0x18, 0xad, 0x16, 0x3c, 0x84, + 0x6e, 0xc5, 0x98, 0x50, 0x2f, 0x3b, 0x9f, 0x0a, 0x98, 0x19, + 0x77, 0x79, 0x4a, 0xac, 0xd3, 0xef, 0xc4, 0xe1, 0xcd, 0xcc, + 0xcc, 0x9f, 0xe1, 0x54, 0x8f, 0xf6, 0x73, 0xb9, 0x70, 0x6b, + 0x68, 0x18, 0xad, 0x14, 0x3c, 0x84, 0x7e, 0xcd, 0x90, 0x50, + 0x2f, 0x72, 0x9e, 0x0b, 0x98, 0x19, 0x67, 0x79, 0x4a, 0xac, + 0xd3, 0xee, 0xc5, 0xa0, 0x32, 0x33, 0x33, 0x60, 0x1e, 0xab, + 0x70, 0x09, 0x8c, 0x46, 0x8f, 0x94, 0x97, 0xe7, 0x52, 0xeb, + 0xc3, 0x7b, 0x81, 0x32, 0x6f, 0xaf, 0xd0, 0x8d, 0x61, 0xf4, + 0x67, 0xe6, 0x98, 0x86, 0xb5, 0x53, 0x2c, 0x11, 0x3a, 0x5f, + 0x7d, 0xdf, 0x3e, 0xa5, 0x71, 0x07, 0xbd, 0x83, 0x2b, 0x76, + 0x2d, 0xd9, 0x10, 0x4c, 0x3f, 0xda, 0x5c, 0x20, 0xab, 0x77, + 0x67, 0x93, 0x85, 0xd5, 0x10, 0x6d, 0x4b, 0xaf, 0x32, 0x3a, + 0x4d, 0x79, 0x2b, 0xbc, 0x6c, 0xd7, 0x9d, 0x18, 0x35, 0x0a, + 0xe3, 0x8b, 0x5d, 
0xa2, 0x47, 0x43, 0xd2, 0xd7, 0xe6, 0xb3, + 0xe6, 0x47, 0xe4, 0x74, 0xae, 0x7d, 0x74, 0xd1, 0x74, 0x64, + 0x0d, 0xe7, 0xc6, 0xc2, 0xcf, 0x2e, 0x78, 0xa6, 0xd5, 0xa2, + 0x0b, 0x32, 0x57, 0x13, 0xbf, 0xc1, 0x6b, 0x4e, 0x63, 0x24, + 0xd1, 0x8e, 0x53, 0x09, 0x48, 0x85, 0x2d, 0x3f, 0x0e, 0x15, + 0xd8, 0x20, 0x69, 0xb2, 0x06, 0xe4, 0xca, 0x11, 0x40, 0xdb, + 0x48, 0xef, 0x01, 0x5f, 0xbe, 0x8e, 0x31, 0xf6, 0x93, 0x1d, + 0xa7, 0x0f, 0x7a, 0x41, 0x7e, 0x00, 0x4c, 0x65, 0xb0, 0x2b, + 0x4a, 0xed, 0x1b, 0xe2, 0x80, 0x4a, 0x1c, 0x8b, 0x92, 0x66, + 0x20, 0x36, 0x23, 0xfe, 0x28, 0x14, 0xb6, 0x00, 0x6a, 0x7d, + 0xfd, 0x61, 0x18, 0x44, 0x0f, 0xcb, 0x28, 0xc6, 0x9f, 0x27, + 0x5d, 0xa2, 0x1c, 0xf2, 0x6c, 0x45, 0x28, 0x67, 0xdd, 0xbe, + 0x7f, 0xf4, 0x9e, 0x2d, 0x25, 0xd3, 0xa7, 0xad, 0x60, 0xce, + 0x4f, 0x4e, 0xa8, 0x3b, 0x6d, 0x37, 0xb6, 0x36, 0x94, 0x0c, + 0x8e, 0x39, 0xf2, 0xa2, 0xdf, 0x39, 0x57, 0xb0, 0x6d, 0x7e, + 0x8b, 0x41, 0x69, 0xd3, 0xdd, 0xee, 0x14, 0x70, 0x3a, 0x7d, + 0x21, 0xd0, 0x83, 0x85, 0x20, 0xec, 0x02, 0x4c, 0xae, 0x36, + 0x69, 0xbe, 0x16, 0x3f, 0x90, 0x17, 0x8e, 0x19, 0xf2, 0x22, + 0xdf, 0x39, 0x57, 0xb0, 0x4d, 0x7e, 0x8a, 0x41, 0x6b, 0xd3, + 0xdd, 0xee, 0x14, 0x70, 0x3a, 0x3d, 0x21, 0xd1, 0x8b, 0xa4, + 0x20, 0xcc, 0x02, 0x4c, 0xac, 0x36, 0x69, 0xbe, 0x16, 0x3f, + 0x90, 0x17, 0x71, 0xe6, 0x0d, 0xdd, 0x20, 0xc6, 0xa8, 0x4f, + 0xb2, 0x81, 0x75, 0xbe, 0x94, 0x2c, 0x22, 0x11, 0xeb, 0x8f, + 0xc5, 0xc2, 0xde, 0x2e, 0x7c, 0x5b, 0xdf, 0x33, 0xfd, 0xb3, + 0x53, 0xc9, 0x96, 0x41, 0xe9, 0xc0, 0x6f, 0xe8, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0xd1, 0x2c, 0xe5, + 0xf5, 0xbc, 0x58, 0x46, 0x30, 0xa8, 0xaa, 0x6f, 0xfe, 0x37, + 0x2c, 0x5c, 0xd3, 0xb0, 0x2d, 0x72, 0x3f, 0x29, 0xbd, 0xe5, + 0x53, 0x02, 0x3e, 0x51, 0xec, 0x64, 0x98, 0xe2, 0x6f, 0xfa, + 0x73, 0x17, 0xf2, 0x84, 0x08, 0x4b, 0x51, 0xb1, 0xbf, 0xf1, + 0xc5, 0x81, 0x01, 0x8a, 0xb4, 0x78, 0x42, 0x06, 0x62, 0x09, + 0x9b, 0x93, 0x7f, 0xaa, 0xc6, 0xb1, 0x49, 0x66, 0x44, 0x00, + 0xa0, 0xd5, 0x04, 0x34, 0x1d, 0x06, 0x4f, 0xc9, 0xea, 0x5c, + 0x7d, 0x7b, 0x22, 0x96, 0xc8, 0x26, 0x93, 0xe7, 0x5a, 0xc0, + 0xd4, 0x67, 0x1c, 0x39, 0xf1, 0x7f, 0x86, 0x4e, 0xd3, 0x76, + 0xd9, 0x62, 0xcc, 0x7c, 0x6f, 0xc3, 0x9a, 0x61, 0xf7, 0xb3, + 0x23, 0x67, 0x25, 0x46, 0xea, 0xcf, 0x62, 
0x04, 0x8e, 0x9d, + 0x16, 0xac, 0x93, 0xec, 0x38, 0x41, 0xa5, 0xd1, 0xfc, 0x77, + 0x51, 0x11, 0x9f, 0x55, 0x2a, 0x4e, 0x2d, 0xd9, 0x4d, 0xb7, + 0xed, 0x50, 0x69, 0x9e, 0x94, 0x94, 0x1c, 0x50, 0x1e, 0xbe, + 0x28, 0x51, 0x32, 0x72, 0x8a, 0xa2, 0xfb, 0xe1, 0x27, 0x05, + 0xc6, 0x56, 0x2a, 0x22, 0xfd, 0xac, 0x4f, 0x78, 0x13, 0xca, + 0xda, 0x10, 0x07, 0x92, 0x50, 0xf4, 0xd9, 0x5a, 0x02, 0x03, + 0x9d, 0x8e, 0x94, 0x56, 0x2a, 0xc0, 0xa0, 0x54, 0x31, 0x52, + 0x8a, 0xa3, 0xde, 0x68, 0x97, 0x1f, 0x6c, 0x66, 0xad, 0x83, + 0x9c, 0xfc, 0x1c, 0xb9, 0x61, 0x21, 0xca, 0xdc, 0x05, 0xda, + 0x49, 0x50, 0xf0, 0x5c, 0x1f, 0xf0, 0x1c, 0x9d, 0x80, 0x1a, + 0x80, 0x2b, 0xa0, 0x50, 0x30, 0x52, 0x8a, 0xa3, 0xde, 0xe8, + 0x97, 0x1f, 0x6c, 0x66, 0xad, 0x83, 0x9c, 0xfc, 0x5d, 0xb9, + 0x61, 0x21, 0xca, 0xdc, 0x05, 0xda, 0x41, 0x50, 0xf9, 0x5c, + 0x1f, 0xf0, 0x9c, 0x9c, 0x14, 0x12, 0x08, 0x2b, 0x5f, 0xaf, + 0xcf, 0xad, 0x75, 0x5c, 0x21, 0x17, 0x68, 0xe0, 0x93, 0x99, + 0x52, 0x7c, 0x63, 0x03, 0xa2, 0x46, 0x9e, 0xde, 0x35, 0x23, + 0xfa, 0x25, 0xbe, 0xaf, 0x06, 0xa3, 0xe0, 0x0f, 0x63, 0x63, + 0xeb, 0xed, 0xf7, 0xd4, 0x69, 0x1a, 0x0a, 0x90, 0x22, 0x49, + 0x3c, 0x6b, 0xb2, 0x1b, 0xe8, 0x9e, 0x45, 0x99, 0xb1, 0x2c, + 0x17, 0x5c, 0x01, 0xb4, 0x58, 0xa9, 0x7b, 0x49, 0x2d, 0xab, + 0xd7, 0x25, 0x7b, 0x99, 0xa0, 0x97, 0x8a, 0xc3, 0x7b, 0x7b, + 0x18, 0xcc, 0x39, 0x37, 0xc0, 0x2e, 0x2d, 0xbc, 0xe9, 0x24, + 0xd8, 0xc8, 0x2b, 0x01, 0xb9, 0xe2, 0x06, 0x27, 0x71, 0x7c, + 0xef, 0x08, 0x9a, 0x2d, 0x42, 0xb7, 0x6f, 0x4b, 0x86, 0x93, + 0xfa, 0xeb, 0xf7, 0x9e, 0x75, 0x3f, 0xea, 0x7b, 0xbc, 0xbe, + 0xb2, 0x5e, 0xe9, 0x69, 0xa8, 0x60, 0x46, 0x05, 0xb5, 0xaa, + 0x2d, 0xd3, 0xfa, 0x47, 0x6e, 0x48, 0xed, 0x9f, 0xd8, 0x33, + 0xe1, 0xf8, 0xda, 0x57, 0x6a, 0xf4, 0x8e, 0x18, 0x2c, 0xf0, + 0x1a, 0xe8, 0x67, 0xc9, 0x03, 0x2c, 0x23, 0x46, 0xd1, 0x5b, + 0xac, 0x4c, 0xd5, 0x4a, 0x2c, 0xfb, 0x9c, 0x2e, 0x27, 0xd3, + 0x51, 0x94, 0x14, 0x25, 0x4a, 0xaf, 0x7e, 0xb8, 0xbe, 0x0b, + 0x3f, 0x3d, 0x33, 0x71, 0x15, 0x2b, 0x13, 0xf9, 0x42, 
0x37, + 0xb6, 0xe9, 0x3c, 0x1b, 0x26, 0xe2, 0x32, 0xd8, 0xaa, 0x05, + 0xe5, 0x27, 0xc3, 0x9f, 0x80, 0x03, 0xf8, 0x55, 0x50, 0x1c, + 0x31, 0x74, 0x85, 0x29, 0x98, 0x71, 0xcd, 0x75, 0xf8, 0xf3, + 0xf5, 0xd6, 0x65, 0xb4, 0x4a, 0x1c, 0x2f, 0xed, 0xa5, 0xfb, + 0x60, 0xe1, 0x70, 0x91, 0xb2, 0x07, 0x3f, 0x63, 0x89, 0xdd, + 0xce, 0xc9, 0x79, 0x5e, 0x74, 0x01, 0x10, 0x75, 0x03, 0x3c, + 0xbc, 0x25, 0x06, 0x0d, 0x2d, 0xf1, 0x15, 0x82, 0xe9, 0x36, + 0x4a, 0x1c, 0x2f, 0xed, 0xac, 0x7b, 0x60, 0xe1, 0x70, 0xd0, + 0xb2, 0x07, 0x37, 0x63, 0x89, 0xdf, 0xc6, 0xcb, 0x78, 0x5e, + 0x74, 0x01, 0x10, 0x75, 0x03, 0x38, 0xbc, 0x27, 0x07, 0x0d, + 0x2d, 0xf1, 0x15, 0x82, 0x69, 0x36, 0xb5, 0xe3, 0xd0, 0x12, + 0x53, 0x84, 0x9f, 0x1e, 0x8f, 0x2f, 0x4d, 0xf8, 0xc8, 0x9c, + 0x76, 0x20, 0x39, 0x34, 0x87, 0xa1, 0x8b, 0xfe, 0xef, 0x8a, + 0xfc, 0xc7, 0x43, 0xd8, 0xf8, 0xf2, 0xd2, 0x0e, 0xea, 0x7d, + 0x96, 0xc9, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 
0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00 +}; + diff --git a/neureka/depthwise/Makefile b/neureka/depthwise/Makefile new file mode 100644 index 0000000..9c8efb5 --- /dev/null +++ b/neureka/depthwise/Makefile @@ -0,0 +1,3 @@ +include ../app/Makefile + +STIM_DIR := ../depthwise/ \ No newline at end of file diff --git a/neureka/depthwise/inc/bias.h b/neureka/depthwise/inc/bias.h new file mode 100644 index 0000000..7466de0 --- /dev/null +++ b/neureka/depthwise/inc/bias.h @@ -0,0 +1,9 @@ +#ifndef __BIAS_H__ +#define __BIAS_H__ + +#include + +#define BIAS_SIZE (67) +extern int32_t bias[BIAS_SIZE]; + +#endif // __BIAS_H__ diff --git a/neureka/depthwise/inc/input.h b/neureka/depthwise/inc/input.h new file mode 100644 index 0000000..ddb3845 --- /dev/null +++ b/neureka/depthwise/inc/input.h @@ -0,0 +1,9 @@ +#ifndef __INPUT_H__ +#define __INPUT_H__ + +#include + +#define INPUT_SIZE (1407) +extern uint8_t input[INPUT_SIZE]; + +#endif // __INPUT_H__ diff --git a/neureka/depthwise/inc/layer_conf.h b/neureka/depthwise/inc/layer_conf.h new file mode 100644 index 0000000..a1e9326 --- /dev/null +++ b/neureka/depthwise/inc/layer_conf.h @@ -0,0 +1,42 @@ +#ifndef __LAYER_CONF_H__ +#define __LAYER_CONF_H__ + +#define TEST_NAME "test" +#define INPUT_HEIGHT (7) +#define INPUT_WIDTH (3) +#define INPUT_CHANNEL (67) +#define INPUT_SIGNED (0) +#define INPUT_BITS (8) + 
+#define OUTPUT_HEIGHT (5) +#define OUTPUT_WIDTH (1) +#define OUTPUT_CHANNEL (67) +#define OUTPUT_BITS (8) + +#define WEIGHT_HEIGHT (3) +#define WEIGHT_WIDTH (3) +#define WEIGHT_CHANNEL_IN (1) +#define WEIGHT_CHANNEL_OUT (67) +#define WEIGHT_BITS (8) +#define WEIGHT_OFFSET (-128) + +#define SCALE_BITS (8) + +#define BIAS_BITS (32) + +#define PADDING_TOP (0) +#define PADDING_BOTTOM (0) +#define PADDING_LEFT (0) +#define PADDING_RIGHT (0) +#define PADDING_VALUE (0) + +#define STRIDE_HEIGHT (1) +#define STRIDE_WIDTH (1) + +#define GROUPS (67) +#define OUTSHIFT (10) +#define HAS_NORM_QUANT (1) +#define HAS_BIAS (1) +#define HAS_RELU (1) + +#endif // __LAYER_CONF_H__ diff --git a/neureka/depthwise/inc/output.h b/neureka/depthwise/inc/output.h new file mode 100644 index 0000000..43b9b83 --- /dev/null +++ b/neureka/depthwise/inc/output.h @@ -0,0 +1,14 @@ +#ifndef __OUTPUT_H__ +#define __OUTPUT_H__ + +#include + +#define OUTPUT_SIZE (335) +extern uint8_t output[OUTPUT_SIZE]; + +#define GOLDEN_OUTPUT_SIZE (335) +extern uint8_t golden_output[GOLDEN_OUTPUT_SIZE]; + +int check_output(); + +#endif // __OUTPUT_H__ diff --git a/neureka/depthwise/inc/scale.h b/neureka/depthwise/inc/scale.h new file mode 100644 index 0000000..cced0eb --- /dev/null +++ b/neureka/depthwise/inc/scale.h @@ -0,0 +1,9 @@ +#ifndef __SCALE_H__ +#define __SCALE_H__ + +#include + +#define SCALE_SIZE (67) +extern uint8_t scale[SCALE_SIZE]; + +#endif // __SCALE_H__ diff --git a/neureka/depthwise/inc/weight.h b/neureka/depthwise/inc/weight.h new file mode 100644 index 0000000..82efe1e --- /dev/null +++ b/neureka/depthwise/inc/weight.h @@ -0,0 +1,9 @@ +#ifndef __WEIGHT_H__ +#define __WEIGHT_H__ + +#include + +#define WEIGHT_SIZE (864) +extern uint8_t weight[WEIGHT_SIZE]; + +#endif // __WEIGHT_H__ diff --git a/neureka/depthwise/src/bias.c b/neureka/depthwise/src/bias.c new file mode 100644 index 0000000..32abcea --- /dev/null +++ b/neureka/depthwise/src/bias.c @@ -0,0 +1,13 @@ +#include "bias.h" + +#define 
BIAS_SIZE (67) +PI_L1 int32_t bias[BIAS_SIZE] = { + 0x13874, 0x26f67, -0x101fd, 0x2f1b9, -0x1b8ff, 0x1ff5c, -0x109c8, -0x378c4, 0xa964, 0x14edd, + -0x18c81, 0x349c8, 0x1fcf, -0x34b55, -0x11ff6, 0x343b6, -0x38662, 0x304d8, 0x103ff, 0x16f6f, + -0x1cc56, 0x1fad4, 0x2ca7e, -0x4851, -0x8866, 0x840c, -0x7ef2, 0x3906f, -0x22218, -0x298f, + 0x3705d, 0x06, -0xf641, -0x3d441, -0x38f1f, 0x37eb9, 0x1b5ce, -0x16e86, 0x32a32, -0xb606, + -0x129fc, 0x3812, 0x2ee55, 0x3b457, -0x16ad9, -0xb971, -0x4571, -0x784b, -0x15e15, -0x2bdda, + -0x22b2b, -0x3a365, -0x27563, 0x3021a, -0x173c7, 0x2c311, -0x2c268, -0x2dee9, -0x165f6, -0x3e422, + -0x15221, 0x3ed37, 0x164d9, -0x167b, -0x34c83, -0xcb89, 0x262d5 +}; + diff --git a/neureka/depthwise/src/input.c b/neureka/depthwise/src/input.c new file mode 100644 index 0000000..7d83843 --- /dev/null +++ b/neureka/depthwise/src/input.c @@ -0,0 +1,147 @@ +#include "input.h" + +#define INPUT_SIZE (1407) +PI_L1 uint8_t input[INPUT_SIZE] = { + 0xc2, 0x2c, 0xe2, 0x81, 0x81, 0x2d, 0xa9, 0xb3, 0x4c, 0x8a, + 0x32, 0x75, 0x40, 0xc1, 0xed, 0x7a, 0x9a, 0x7d, 0x5e, 0xdf, + 0xc8, 0x44, 0x83, 0xe8, 0x1e, 0x1b, 0xd2, 0x84, 0x39, 0xc5, + 0x2c, 0x27, 0x9b, 0x78, 0xb3, 0x70, 0x5a, 0x5b, 0xb5, 0x95, + 0x3c, 0x8f, 0xe4, 0x32, 0xfa, 0x12, 0xb7, 0xc7, 0x48, 0x91, + 0x02, 0x33, 0xd0, 0xbe, 0x57, 0xae, 0x61, 0xd0, 0x8a, 0x5d, + 0xd1, 0x72, 0x25, 0x0a, 0x81, 0x35, 0xd7, 0x36, 0xec, 0x04, + 0x37, 0xda, 0x99, 0xf4, 0x28, 0x46, 0xc8, 0xdd, 0x25, 0xae, + 0x37, 0x78, 0xf2, 0x15, 0x48, 0x57, 0xc2, 0x5b, 0xe1, 0x63, + 0x07, 0xab, 0xac, 0x7c, 0x2a, 0xe0, 0xbc, 0x75, 0xa8, 0xf0, + 0x48, 0x74, 0xe1, 0xad, 0xce, 0x17, 0x21, 0xf1, 0x72, 0x3d, + 0x08, 0x00, 0x44, 0x04, 0x46, 0xc7, 0x35, 0x39, 0x5f, 0xf9, + 0xdc, 0xad, 0x25, 0x23, 0xd5, 0x0f, 0xf0, 0x5c, 0x23, 0x08, + 0x25, 0x82, 0xd9, 0x76, 0xe9, 0xd1, 0x10, 0x9b, 0x13, 0x38, + 0x67, 0x8b, 0x96, 0x80, 0x7f, 0xe8, 0x58, 0xbf, 0x9e, 0xf3, + 0x34, 0x20, 0x06, 0xb6, 0xcd, 0x80, 0x3f, 0xf5, 0x2b, 0x95, + 0x90, 0x66, 0xa7, 0xc3, 0xf0, 0x01, 
0x13, 0x11, 0x09, 0xbd, + 0x94, 0xfe, 0x52, 0x98, 0xda, 0x19, 0x19, 0x1a, 0x47, 0xca, + 0x5f, 0x7b, 0x64, 0xd8, 0x15, 0xb1, 0xcd, 0x3a, 0xe0, 0x29, + 0x61, 0x93, 0xa6, 0xb9, 0x99, 0x3b, 0x66, 0x46, 0x37, 0x4f, + 0x64, 0x87, 0xb3, 0xe3, 0x1c, 0xdf, 0x20, 0xc4, 0x48, 0xca, + 0x9c, 0xa8, 0x48, 0xd0, 0x52, 0x06, 0x3d, 0x87, 0x29, 0x23, + 0x99, 0x68, 0xed, 0x0a, 0x5c, 0xee, 0x5a, 0xec, 0xd6, 0x6e, + 0x65, 0x75, 0xa7, 0x0f, 0x17, 0x4b, 0xa0, 0xcf, 0xc4, 0xdc, + 0x28, 0xc2, 0x69, 0x4a, 0x61, 0x24, 0x55, 0xa2, 0x0c, 0xcd, + 0x6c, 0x79, 0xf8, 0xa1, 0xc9, 0x39, 0xc0, 0xe2, 0xc8, 0x00, + 0xe5, 0xbe, 0xf0, 0x81, 0x2c, 0xc4, 0xa9, 0x0c, 0xdf, 0xce, + 0xee, 0xe8, 0xb8, 0x62, 0x49, 0x07, 0xaa, 0x3c, 0xed, 0x30, + 0xd9, 0x77, 0x10, 0xd7, 0x32, 0x5e, 0x3d, 0xef, 0x27, 0x13, + 0x50, 0xb5, 0xb5, 0x21, 0x76, 0x14, 0x8f, 0x77, 0x7d, 0xb1, + 0xf8, 0xc3, 0x45, 0xfd, 0x35, 0x59, 0xa4, 0xca, 0xca, 0xbd, + 0x8d, 0xea, 0x83, 0x78, 0xa2, 0x6b, 0x11, 0xef, 0xae, 0x9d, + 0x1f, 0x9d, 0xcc, 0x98, 0x89, 0x22, 0x8b, 0x7d, 0xe8, 0xa7, + 0x61, 0x9e, 0x0c, 0xe3, 0x7e, 0xcc, 0x6d, 0x19, 0xbc, 0x71, + 0x8c, 0x01, 0x2a, 0x03, 0xa7, 0x81, 0x99, 0x12, 0x93, 0xc7, + 0x18, 0x3d, 0x66, 0x37, 0x98, 0x08, 0xf2, 0xb6, 0x0b, 0xa2, + 0x89, 0x65, 0x34, 0x07, 0x6e, 0x09, 0x84, 0xfe, 0x73, 0xf8, + 0x96, 0xbd, 0x09, 0x5c, 0x47, 0x1e, 0x0e, 0xa9, 0x58, 0xe7, + 0x5d, 0xc1, 0xdb, 0xe8, 0x67, 0x40, 0x21, 0x2b, 0x6a, 0x00, + 0x49, 0x57, 0xd1, 0x67, 0x18, 0xfa, 0x79, 0x87, 0xd1, 0x45, + 0x5a, 0xbb, 0x43, 0x3b, 0x2f, 0xd9, 0xbe, 0x8b, 0x61, 0x1f, + 0xc9, 0xa9, 0xe9, 0x10, 0xcb, 0x5b, 0x24, 0x82, 0x30, 0x5a, + 0x77, 0xe4, 0x2f, 0x40, 0x67, 0x55, 0xd1, 0x84, 0x29, 0x91, + 0x6f, 0x4b, 0x21, 0x94, 0xaa, 0x1f, 0x50, 0x1c, 0xc0, 0xb9, + 0x19, 0x0a, 0xd2, 0xe3, 0x7f, 0x91, 0x4d, 0x26, 0x93, 0x3b, + 0x01, 0xfd, 0x69, 0xba, 0x5d, 0xd0, 0x2b, 0x53, 0x6e, 0xd6, + 0x0f, 0x95, 0xde, 0x25, 0xbb, 0x3a, 0x6b, 0x36, 0x50, 0xa3, + 0xf9, 0x3d, 0x4d, 0xb2, 0x59, 0x49, 0xd1, 0xf3, 0x5c, 0x4e, + 0xd4, 0xb9, 0x6d, 0x1a, 0x72, 0x5e, 0x42, 0x92, 
0x5b, 0xc8, + 0xdc, 0x89, 0x28, 0xe2, 0xae, 0xee, 0x61, 0xe5, 0x79, 0x3e, + 0x10, 0x7c, 0x62, 0x2b, 0x9b, 0xf6, 0x0c, 0x6e, 0x61, 0xc1, + 0xf1, 0x34, 0x77, 0x52, 0x0f, 0xa2, 0xce, 0x21, 0x27, 0x62, + 0xc6, 0xb3, 0xc1, 0x74, 0x43, 0xeb, 0xcb, 0x74, 0x82, 0x91, + 0xf9, 0x38, 0x81, 0x76, 0x91, 0x50, 0x8e, 0x96, 0x73, 0x14, + 0x61, 0x62, 0x95, 0x6d, 0x01, 0x16, 0x24, 0xb2, 0x66, 0x30, + 0x0e, 0x33, 0x47, 0x37, 0xa4, 0xc8, 0x0f, 0xc7, 0x1d, 0xed, + 0xc4, 0x01, 0x54, 0xa2, 0xbb, 0xfe, 0x15, 0x0c, 0x0d, 0x64, + 0xd5, 0x09, 0x59, 0xf4, 0x3d, 0x46, 0xfb, 0x0c, 0xc6, 0x6d, + 0x46, 0x2e, 0x84, 0x28, 0x4a, 0xed, 0x33, 0xad, 0xfd, 0x9b, + 0xfd, 0x99, 0xd8, 0x17, 0x89, 0xe0, 0x42, 0x71, 0x39, 0xba, + 0xa1, 0xfe, 0x23, 0xf3, 0x88, 0x95, 0xc3, 0x3a, 0xb8, 0x5b, + 0x0b, 0x5c, 0x0c, 0x23, 0x64, 0x4b, 0xe9, 0x0d, 0x0f, 0x33, + 0xd2, 0x20, 0x0d, 0x1f, 0x7a, 0x39, 0x2d, 0x8f, 0xc5, 0xd1, + 0x48, 0x85, 0xba, 0x06, 0x4c, 0xbe, 0xd4, 0xfb, 0x8d, 0x6a, + 0xd3, 0xe5, 0x3b, 0xeb, 0xa0, 0xfa, 0x1b, 0x30, 0xd3, 0x08, + 0x26, 0x2e, 0x0b, 0x11, 0x43, 0x46, 0xba, 0x8d, 0x4d, 0xf2, + 0x6e, 0xc0, 0x99, 0x63, 0xc8, 0x54, 0x0f, 0xbe, 0xcd, 0xd0, + 0x65, 0xc0, 0xad, 0x19, 0xcd, 0xdb, 0x9b, 0x29, 0x3a, 0xef, + 0xc6, 0x0a, 0x0d, 0xd1, 0xe7, 0x0a, 0xbe, 0x6d, 0xb9, 0x46, + 0x76, 0xdc, 0x51, 0x63, 0xbf, 0x08, 0x41, 0x47, 0xbc, 0x30, + 0x50, 0x00, 0xf8, 0xc6, 0x92, 0x30, 0xec, 0xf0, 0x31, 0xa2, + 0x4e, 0xac, 0xc8, 0x63, 0x14, 0x05, 0x62, 0x22, 0x5e, 0x29, + 0x6b, 0x5e, 0x73, 0xdb, 0x7e, 0x5a, 0xf8, 0x9f, 0xd6, 0xfc, + 0x05, 0x50, 0x14, 0xa9, 0xf3, 0x66, 0xe5, 0x92, 0xad, 0x1f, + 0xe4, 0x65, 0x8b, 0x3b, 0xf5, 0x46, 0x13, 0x71, 0x19, 0x2a, + 0xc2, 0xb4, 0x51, 0xb8, 0xd7, 0x99, 0x23, 0xe5, 0x2b, 0xb2, + 0xfc, 0xb9, 0x1e, 0xd9, 0x5a, 0x3c, 0x7d, 0x4d, 0x33, 0x61, + 0x2a, 0xdf, 0x1a, 0xb5, 0x1f, 0x6a, 0x82, 0x9b, 0xcc, 0xe0, + 0x90, 0x6a, 0x6d, 0x41, 0x11, 0xe7, 0x5c, 0xf1, 0x09, 0x3d, + 0x04, 0xe0, 0x61, 0x64, 0xba, 0x01, 0x89, 0xa4, 0xee, 0x49, + 0x6f, 0x16, 0x8d, 0x72, 0x51, 0x87, 0x1a, 0x56, 0x88, 0x5a, + 
0x3c, 0x8f, 0xc6, 0xe9, 0xc8, 0xb2, 0xf4, 0x3b, 0x56, 0x19, + 0x8c, 0x44, 0x04, 0xe4, 0xa6, 0xb2, 0x05, 0xdd, 0x2b, 0xc0, + 0x26, 0x92, 0x6a, 0xe0, 0xe9, 0x43, 0xb4, 0xd6, 0x31, 0x03, + 0xfc, 0xd8, 0x60, 0xd8, 0xbe, 0x5d, 0x09, 0x5b, 0xea, 0x2e, + 0x3d, 0x19, 0x6a, 0x0b, 0xdb, 0xfc, 0x1a, 0x4c, 0xe7, 0xc4, + 0xe0, 0x3a, 0xed, 0xa3, 0xf3, 0x05, 0x29, 0x59, 0xf7, 0x49, + 0x45, 0x09, 0x82, 0xb9, 0x54, 0x64, 0x61, 0x5e, 0x73, 0x03, + 0xa5, 0xe1, 0x5a, 0x7f, 0x27, 0x3f, 0x9c, 0xf6, 0x11, 0x38, + 0xa3, 0x09, 0xc1, 0x2e, 0x2b, 0xb4, 0xed, 0x5a, 0xc9, 0x4b, + 0x9b, 0x4b, 0x9a, 0xfa, 0x73, 0xe5, 0xa1, 0xc2, 0x9c, 0xfb, + 0x7f, 0x15, 0x54, 0xc0, 0xfe, 0x13, 0xa6, 0x1d, 0x0a, 0xed, + 0xb2, 0x79, 0xcb, 0xcc, 0x04, 0xb8, 0x05, 0x99, 0x62, 0x98, + 0x43, 0xe8, 0x7d, 0xd0, 0xd1, 0xb1, 0xc1, 0x81, 0xb2, 0xe1, + 0x17, 0x2e, 0xd4, 0x54, 0x90, 0x65, 0xf9, 0x5b, 0xc4, 0xc8, + 0xee, 0x9a, 0x14, 0x0a, 0x72, 0x65, 0xa5, 0x3c, 0xc5, 0x2d, + 0xd5, 0xc1, 0x7d, 0x3f, 0x48, 0xd5, 0x41, 0xe6, 0x1e, 0x55, + 0x53, 0xa6, 0x7e, 0x3d, 0xd0, 0xe5, 0xf9, 0xb2, 0x55, 0x2b, + 0x21, 0xad, 0xac, 0x18, 0xdf, 0x7a, 0x3a, 0xc4, 0x5e, 0x62, + 0x94, 0x0b, 0x0d, 0xfd, 0x33, 0x0b, 0x59, 0x53, 0xaa, 0xbd, + 0xc6, 0x26, 0x6f, 0xae, 0xb8, 0x29, 0x43, 0xc4, 0x6a, 0x61, + 0x2d, 0x7f, 0x3e, 0x39, 0x81, 0xd0, 0xad, 0x34, 0xa9, 0x50, + 0x69, 0x46, 0x07, 0x69, 0xf3, 0xd3, 0x74, 0x74, 0x65, 0xb1, + 0x5d, 0x90, 0x8c, 0xb6, 0x39, 0xd5, 0xd1, 0x1b, 0x73, 0xd7, + 0xd4, 0x5d, 0xbb, 0x81, 0x37, 0x86, 0x29, 0x6b, 0x81, 0x1b, + 0x56, 0xb2, 0x90, 0x35, 0xc1, 0xc1, 0x8c, 0x64, 0x59, 0x3a, + 0x6b, 0xa2, 0x35, 0x33, 0x77, 0xf4, 0x14, 0xda, 0xbd, 0x92, + 0x3b, 0xa0, 0x9c, 0x68, 0xce, 0xb9, 0x6e, 0x58, 0x5a, 0x56, + 0x93, 0x26, 0x41, 0x0b, 0x23, 0x18, 0x35, 0x56, 0x6e, 0x1e, + 0x87, 0x16, 0x81, 0xca, 0x03, 0x5e, 0x43, 0x94, 0xec, 0xc5, + 0xdd, 0xb9, 0x81, 0xcd, 0x43, 0x54, 0x91, 0x1a, 0x1f, 0x99, + 0x83, 0x21, 0xb2, 0xf2, 0x13, 0x3e, 0x23, 0x1b, 0x43, 0xd8, + 0x51, 0x51, 0xe2, 0x06, 0x10, 0xc8, 0xb9, 0x46, 0x8e, 0x20, + 0x50, 0xa2, 
0x6a, 0x5e, 0x2d, 0xa7, 0xf1, 0xa5, 0x47, 0x6c, + 0x65, 0x2f, 0xd4, 0xc9, 0x76, 0xe1, 0xef, 0xd7, 0x53, 0x21, + 0x52, 0x28, 0xc8, 0x3d, 0x61, 0x77, 0x11, 0x01, 0x71, 0x3b, + 0xdc, 0xac, 0x7f, 0xf8, 0xb0, 0xc0, 0x69, 0xda, 0xb1, 0x61, + 0x94, 0x61, 0x37, 0x96, 0xc7, 0xc7, 0x5b, 0xdd, 0xfb, 0x7f, + 0x00, 0xe0, 0x1f, 0xb4, 0x80, 0x74, 0xf9, 0x30, 0x97, 0xc9, + 0x12, 0xc5, 0x55, 0x82, 0x9b, 0xbf, 0xf0, 0x1d, 0xc7, 0x93, + 0x5a, 0x32, 0xc6, 0xf5, 0x7c, 0xbd, 0xfe, 0x67, 0x8b, 0x68, + 0x3d, 0x70, 0xd8, 0x24, 0x71, 0xc2, 0x01, 0x2a, 0x44, 0x93, + 0x84, 0x9d, 0x91, 0x69, 0xb4, 0x6d, 0xa7, 0x7a, 0xab, 0x12, + 0xcf, 0xab, 0xaf, 0x7c, 0x7b, 0x75, 0x53, 0xc9, 0x14, 0xb0, + 0x04, 0x9f, 0x40, 0x72, 0x96, 0xf8, 0x95, 0xb4, 0xda, 0x3e, + 0x1b, 0xf7, 0x3c, 0x91, 0x8d, 0x18, 0x52, 0x62, 0x2a, 0x4f, + 0xf4, 0x55, 0x7b, 0x4d, 0x15, 0x59, 0x71, 0xed, 0xf4, 0x25, + 0xf8, 0x26, 0x3f, 0xc6, 0x8f, 0x5b, 0x1d, 0x96, 0x3a, 0x13, + 0x24, 0x08, 0x5f, 0x5b, 0x69, 0x27, 0x5a, 0xbb, 0xb0, 0x86, + 0xdc, 0x84, 0x9d, 0x4a, 0x6b, 0x41, 0xd3, 0x87, 0x57, 0x08, + 0x1f, 0x22, 0x32, 0xc1, 0x7f, 0x2f, 0x84, 0xcb, 0xba, 0x86, + 0x27, 0x87, 0xa6, 0xa4, 0x1a, 0x99, 0xb5, 0xa5, 0xa1, 0xc3, + 0xa9, 0xa4, 0x03, 0x45, 0xc8, 0x2a, 0x94, 0xdf, 0x5c, 0x3b, + 0xc0, 0x72, 0x85, 0xf3, 0xe0, 0x1f, 0xad, 0x44, 0x66, 0x85, + 0xdd, 0x7b, 0xf3, 0x3a, 0x59, 0xd7, 0x81, 0x27, 0x65, 0xcf, + 0xba, 0xf0, 0x74, 0xfd, 0xf5, 0xf5, 0x12, 0xce, 0x31, 0xdf, + 0xf1, 0xdf, 0x22, 0x93, 0x6d, 0x23, 0x7f, 0xa3, 0xa7, 0x54, + 0xa0, 0x05, 0x10, 0xe5, 0x2d, 0x49, 0x14, 0x8f, 0x99, 0xe1, + 0xf5, 0x8e, 0x51, 0xe0, 0xe3, 0xcf, 0x56, 0x2f, 0x6e, 0xae, + 0xa5, 0x91, 0x4e, 0xe2, 0x73, 0x4c, 0xea, 0xc2, 0x25, 0x6a, + 0x8c, 0x6a, 0xb6, 0x26, 0x88, 0x28, 0xb5 +}; + diff --git a/neureka/depthwise/src/output.c b/neureka/depthwise/src/output.c new file mode 100644 index 0000000..196105c --- /dev/null +++ b/neureka/depthwise/src/output.c @@ -0,0 +1,62 @@ +#include "output.h" + +#define OUTPUT_SIZE (335) +PI_L1 uint8_t output[OUTPUT_SIZE]; + +#define 
GOLDEN_OUTPUT_SIZE (335) +PI_L2 uint8_t golden_output[GOLDEN_OUTPUT_SIZE] = { + 0x26, 0xff, 0x00, 0x8c, 0x00, 0x92, 0x00, 0x54, 0x00, 0x5e, + 0x00, 0xcd, 0x04, 0x00, 0x00, 0xf8, 0x00, 0x7d, 0x7b, 0x00, + 0x00, 0x00, 0xb2, 0xc4, 0x00, 0x15, 0x00, 0xdb, 0x00, 0x9d, + 0xfe, 0x00, 0x00, 0x00, 0x00, 0xd9, 0x44, 0x00, 0xaf, 0x00, + 0x00, 0xb6, 0x4b, 0xfb, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x68, 0x69, 0x32, 0x00, 0x00, 0x90, 0x58, 0xff, 0x00, + 0xff, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x06, 0x00, 0xca, 0x00, + 0x00, 0x00, 0xd3, 0x00, 0xad, 0x0b, 0x00, 0x00, 0x0d, 0xb0, + 0x43, 0x00, 0x87, 0x00, 0xd7, 0x00, 0x3c, 0xff, 0x00, 0xff, + 0x00, 0x00, 0xea, 0x2f, 0x00, 0xc9, 0x10, 0x00, 0xd4, 0x08, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x3b, 0x00, 0x00, + 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x99, 0xff, 0x77, + 0x35, 0x00, 0x00, 0xff, 0x4a, 0xff, 0x00, 0xa9, 0x00, 0x7f, + 0x00, 0x00, 0x00, 0x81, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xd1, + 0x00, 0xa0, 0x7a, 0x00, 0x00, 0x18, 0xb0, 0x7d, 0x00, 0x3d, + 0x00, 0xd5, 0x00, 0x4c, 0xcc, 0x00, 0x51, 0x00, 0x00, 0xd9, + 0x42, 0x00, 0xce, 0x00, 0x00, 0xac, 0x86, 0xff, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0xd5, 0x00, 0x00, 0x18, 0x00, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x72, 0xff, 0x9d, 0x56, 0x00, 0x00, + 0xff, 0x2d, 0xff, 0x00, 0xff, 0x00, 0x89, 0x00, 0xef, 0x00, + 0xb6, 0x00, 0xc9, 0x00, 0x00, 0x00, 0xff, 0x00, 0x8a, 0x2d, + 0x13, 0x00, 0x38, 0xaf, 0xaf, 0x00, 0x46, 0x00, 0xd1, 0x00, + 0x88, 0xcc, 0x00, 0x32, 0x1e, 0x00, 0xe5, 0x37, 0x00, 0xf2, + 0x00, 0x00, 0x8c, 0x92, 0xfb, 0x70, 0x00, 0x08, 0x00, 0x00, + 0x00, 0x62, 0x00, 0x00, 0x68, 0x00, 0xff, 0x00, 0x00, 0x00, + 0x00, 0xe7, 0xfd, 0x78, 0x40, 0x00, 0x00, 0xe0, 0x56, 0xff, + 0x00, 0x7c, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x45, 0x00, 0xcb, + 0x05, 0x00, 0x00, 0xff, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x29, + 0xb4, 0xb2, 0x00, 0x5d, 0x00, 0xcc, 0x00, 0xc2, 0xff, 0x00, + 0x00, 0x00, 0x00, 0xc2, 0x8f, 0x01, 0xab, 0x00, 0x00, 0x5e, 
+ 0x3d, 0xff, 0x00, 0x00, 0x09, 0x0b, 0x00, 0x00, 0x71, 0x00, + 0x00, 0x30, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0xeb, 0xff, + 0x77, 0x70, 0x00, 0x00, 0xff +}; + +int check_output() { + printf("Checking the output vector:\n"); + + int n_err = 0; + for (int i = 0; i < OUTPUT_SIZE; i++) { + if (output[i] != golden_output[i]) { + printf("ERROR: wrong value of output @ %d: %d vs. golden: %d\n", i, output[i], golden_output[i]); + n_err++; + } + } + + if (n_err == 0) + printf("> Success! No errors found.\n"); + else + printf("> Failure! Found %d/%d errors.\n", n_err, OUTPUT_SIZE); + return n_err; + } + + \ No newline at end of file diff --git a/neureka/depthwise/src/scale.c b/neureka/depthwise/src/scale.c new file mode 100644 index 0000000..8ade6da --- /dev/null +++ b/neureka/depthwise/src/scale.c @@ -0,0 +1,13 @@ +#include "scale.h" + +#define SCALE_SIZE (67) +PI_L1 uint8_t scale[SCALE_SIZE] = { + 0x05, 0x1c, 0x06, 0x0c, 0x02, 0x04, 0x0b, 0x1b, 0x0e, 0x1b, + 0x15, 0x02, 0x04, 0x08, 0x12, 0x07, 0x03, 0x0e, 0x19, 0x1b, + 0x04, 0x07, 0x01, 0x1a, 0x03, 0x0f, 0x01, 0x02, 0x14, 0x0f, + 0x16, 0x1d, 0x18, 0x15, 0x1e, 0x05, 0x09, 0x12, 0x0f, 0x19, + 0x06, 0x0a, 0x0b, 0x05, 0x1b, 0x02, 0x06, 0x1d, 0x0d, 0x19, + 0x1b, 0x10, 0x15, 0x19, 0x02, 0x1c, 0x04, 0x12, 0x05, 0x09, + 0x1e, 0x1e, 0x09, 0x17, 0x09, 0x08, 0x1a +}; + diff --git a/neureka/depthwise/src/weight.c b/neureka/depthwise/src/weight.c new file mode 100644 index 0000000..b473481 --- /dev/null +++ b/neureka/depthwise/src/weight.c @@ -0,0 +1,93 @@ +#include "weight.h" + +#define WEIGHT_SIZE (864) +PI_L1 uint8_t weight[WEIGHT_SIZE] = { + 0xfa, 0x5e, 0x7d, 0x0e, 0x0d, 0x0d, 0x03, 0xc1, 0x56, 0x6a, + 0x95, 0x9e, 0x1e, 0xdf, 0x88, 0x4d, 0xbe, 0x88, 0xc4, 0x67, + 0x6e, 0x2e, 0x9e, 0x60, 0xcc, 0xb4, 0x25, 0x04, 0xe5, 0xfc, + 0x82, 0x6b, 0x1e, 0x34, 0x8a, 0x08, 0x47, 0xda, 0xe8, 0xeb, + 0x39, 0x54, 0x10, 0x81, 0x36, 0x76, 0x53, 0xa3, 0x87, 0x45, + 0x86, 0xd4, 0x9b, 0xb1, 0x58, 0xb8, 0xad, 0xa8, 0x47, 0x0a, + 0x0c, 0xde, 0x20, 0xc5, 
0xae, 0xf5, 0xaa, 0xaf, 0x29, 0x92, + 0x48, 0x40, 0xd2, 0x79, 0x9d, 0x5c, 0xa5, 0x8e, 0x6d, 0x1a, + 0x0b, 0x12, 0xf4, 0x03, 0xa3, 0x1f, 0x45, 0x3c, 0x16, 0xd5, + 0xb6, 0xe1, 0x25, 0xce, 0x4a, 0x1d, 0xe4, 0xc8, 0xd6, 0xda, + 0x54, 0xfa, 0x6c, 0x63, 0x35, 0x4b, 0x0b, 0x3a, 0x84, 0xe5, + 0x8c, 0x89, 0x18, 0xb9, 0x94, 0x6d, 0x01, 0xf2, 0x5d, 0x39, + 0xc0, 0x2d, 0x4d, 0x97, 0x42, 0xaa, 0x49, 0x6b, 0x2b, 0xa0, + 0xd4, 0x4b, 0x06, 0xeb, 0x48, 0xab, 0xc5, 0x11, 0x2b, 0xf8, + 0xb9, 0xc8, 0xa7, 0xb0, 0xe5, 0xe8, 0x69, 0x5b, 0xe8, 0xaf, + 0xdf, 0xa2, 0x32, 0x3a, 0xc6, 0x20, 0x86, 0xc9, 0x2d, 0x81, + 0xa3, 0xd2, 0x07, 0xd2, 0x43, 0x24, 0xd8, 0x02, 0x0c, 0x37, + 0x48, 0x6f, 0x5f, 0x83, 0x47, 0x76, 0xd5, 0xf6, 0xe6, 0xb1, + 0xef, 0xcb, 0x5a, 0x39, 0xc0, 0x85, 0x0e, 0xa9, 0x30, 0x39, + 0x75, 0x01, 0x06, 0xc7, 0x29, 0x35, 0xa3, 0x98, 0x56, 0xd9, + 0x40, 0x54, 0xe8, 0x0d, 0x4f, 0xb7, 0x68, 0xfb, 0x57, 0x82, + 0x2f, 0x56, 0x91, 0x70, 0xe2, 0xab, 0xe7, 0xcb, 0x5a, 0x19, + 0xc0, 0xa5, 0x0f, 0xa9, 0xb0, 0x39, 0x75, 0x01, 0x06, 0xcf, + 0x2d, 0xb5, 0xa3, 0x98, 0x56, 0xd9, 0x49, 0x54, 0xe8, 0x0b, + 0x4c, 0x37, 0x68, 0xfb, 0x57, 0x83, 0x2f, 0x56, 0x91, 0x70, + 0xe2, 0xbb, 0x18, 0x34, 0xa5, 0xe6, 0x3f, 0x5a, 0xf0, 0x56, + 0xcf, 0xc6, 0x8a, 0xfe, 0xf9, 0x30, 0xd2, 0x4a, 0x5c, 0x67, + 0xa9, 0x26, 0xb6, 0xab, 0x17, 0xf4, 0xb3, 0xc8, 0x97, 0x04, + 0xa8, 0x7c, 0xd0, 0xa9, 0x6e, 0x8f, 0x1d, 0x44, 0xe0, 0x3e, + 0x07, 0xbc, 0x00, 0x64, 0x58, 0x14, 0x26, 0xc1, 0x52, 0x73, + 0x88, 0xf8, 0x22, 0xa9, 0x3c, 0x42, 0x0d, 0x4a, 0xec, 0xf8, + 0xc2, 0xd7, 0x29, 0x6d, 0x02, 0x04, 0xc4, 0x15, 0x40, 0x12, + 0xe1, 0xc4, 0x5c, 0x34, 0x8f, 0x46, 0x86, 0xbc, 0x93, 0xf4, + 0xeb, 0x2d, 0xcb, 0x67, 0xb6, 0x56, 0x32, 0x8c, 0xe0, 0x87, + 0x3e, 0x17, 0x06, 0xe6, 0x57, 0xf9, 0xfc, 0x7f, 0x6d, 0x3d, + 0x08, 0x83, 0x1c, 0x36, 0x47, 0x2b, 0x55, 0xf5, 0x89, 0x66, + 0xcf, 0xf5, 0x5a, 0x34, 0x95, 0x7b, 0x6f, 0xfa, 0x1a, 0x5b, + 0x1b, 0x05, 0x62, 0x8d, 0x0a, 0x21, 0x83, 0x83, 0x22, 0x3c, + 0x39, 0x47, 0x24, 0x17, 0x8f, 0xea, 
0xce, 0x2d, 0xdf, 0x47, + 0x86, 0x49, 0x4f, 0xba, 0x0e, 0x2f, 0xa5, 0xa5, 0x1b, 0x3d, + 0xc6, 0x20, 0x4f, 0xf0, 0x86, 0x62, 0x47, 0xed, 0x22, 0x19, + 0x71, 0xad, 0x27, 0xaf, 0xe9, 0x56, 0x93, 0xf5, 0x5c, 0x47, + 0xff, 0xce, 0xbf, 0x2a, 0xa8, 0x95, 0x9a, 0xb5, 0x17, 0xc1, + 0xa6, 0x3b, 0x6e, 0xd9, 0x6a, 0xde, 0xf3, 0x7d, 0xd8, 0x33, + 0xca, 0x0d, 0xa9, 0xd2, 0x48, 0x38, 0x3b, 0x84, 0x48, 0x60, + 0xb7, 0x2c, 0x8a, 0x85, 0xb8, 0xcb, 0x01, 0xe5, 0xe5, 0x94, + 0xbf, 0xb4, 0x5b, 0x60, 0x98, 0xfb, 0xe6, 0xae, 0x45, 0xf1, + 0x6b, 0xbe, 0x65, 0x49, 0x54, 0x33, 0xd7, 0x32, 0x2f, 0xc0, + 0x5c, 0x2a, 0x31, 0xa7, 0x0c, 0xe6, 0xaf, 0x2a, 0x9f, 0x64, + 0x7c, 0x57, 0xab, 0xbb, 0xa7, 0x15, 0xbc, 0x92, 0x84, 0x2f, + 0x81, 0x87, 0xe3, 0x8f, 0x45, 0xf1, 0x6b, 0xbe, 0x65, 0x69, + 0x5c, 0x33, 0xc6, 0x32, 0x2f, 0xc0, 0x5c, 0x28, 0x39, 0xa5, + 0x0c, 0xe4, 0xaf, 0x2e, 0x9b, 0x64, 0x78, 0x47, 0xab, 0xbf, + 0xe7, 0x15, 0xbc, 0x94, 0xc0, 0x2f, 0x90, 0x87, 0xe3, 0x8f, + 0xba, 0x0e, 0x94, 0x41, 0x9a, 0x96, 0xa3, 0xcc, 0x39, 0xcd, + 0xd0, 0x3f, 0xa3, 0xd7, 0xc6, 0x5a, 0xf3, 0x1b, 0x50, 0xd1, + 0x64, 0x9b, 0x87, 0xb8, 0x54, 0x40, 0x18, 0xea, 0x43, 0x6b, + 0x3f, 0xd0, 0x6f, 0x78, 0x1c, 0x70, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 
0x03, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00 +}; + diff --git a/neureka/idma_pointwise/Makefile b/neureka/idma_pointwise/Makefile new file mode 100644 index 0000000..41c2bc7 --- /dev/null +++ b/neureka/idma_pointwise/Makefile @@ -0,0 +1,5 @@ +APP_SRCS += main.c +OVERRIDE_MAIN = 1 +include ../app/Makefile + +STIM_DIR := ../pointwise/ \ No newline at end of file diff --git a/neureka/idma_pointwise/inc/bias.h b/neureka/idma_pointwise/inc/bias.h new file mode 100644 index 0000000..f28d0f5 --- /dev/null +++ b/neureka/idma_pointwise/inc/bias.h @@ -0,0 +1,9 @@ +#ifndef __BIAS_H__ +#define __BIAS_H__ + +#include + +#define BIAS_SIZE (39) +extern int32_t bias[BIAS_SIZE]; + +#endif // __BIAS_H__ diff --git a/neureka/idma_pointwise/inc/input.h b/neureka/idma_pointwise/inc/input.h new file mode 100644 index 0000000..985f9a9 --- /dev/null +++ b/neureka/idma_pointwise/inc/input.h @@ -0,0 +1,9 @@ +#ifndef __INPUT_H__ +#define __INPUT_H__ + +#include + +#define INPUT_SIZE (8995) +extern uint8_t input[INPUT_SIZE]; + +#endif // __INPUT_H__ diff --git 
a/neureka/idma_pointwise/inc/layer_conf.h b/neureka/idma_pointwise/inc/layer_conf.h new file mode 100644 index 0000000..e5e4812 --- /dev/null +++ b/neureka/idma_pointwise/inc/layer_conf.h @@ -0,0 +1,42 @@ +#ifndef __LAYER_CONF_H__ +#define __LAYER_CONF_H__ + +#define TEST_NAME "test" +#define INPUT_HEIGHT (7) +#define INPUT_WIDTH (5) +#define INPUT_CHANNEL (257) +#define INPUT_SIGNED (0) +#define INPUT_BITS (8) + +#define OUTPUT_HEIGHT (7) +#define OUTPUT_WIDTH (5) +#define OUTPUT_CHANNEL (39) +#define OUTPUT_BITS (8) + +#define WEIGHT_HEIGHT (1) +#define WEIGHT_WIDTH (1) +#define WEIGHT_CHANNEL_IN (257) +#define WEIGHT_CHANNEL_OUT (39) +#define WEIGHT_BITS (8) +#define WEIGHT_OFFSET (-128) + +#define SCALE_BITS (8) + +#define BIAS_BITS (32) + +#define PADDING_TOP (0) +#define PADDING_BOTTOM (0) +#define PADDING_LEFT (0) +#define PADDING_RIGHT (0) +#define PADDING_VALUE (0) + +#define STRIDE_HEIGHT (1) +#define STRIDE_WIDTH (1) + +#define GROUPS (1) +#define OUTSHIFT (12) +#define HAS_NORM_QUANT (1) +#define HAS_BIAS (1) +#define HAS_RELU (1) + +#endif // __LAYER_CONF_H__ diff --git a/neureka/idma_pointwise/inc/output.h b/neureka/idma_pointwise/inc/output.h new file mode 100644 index 0000000..68c6c11 --- /dev/null +++ b/neureka/idma_pointwise/inc/output.h @@ -0,0 +1,14 @@ +#ifndef __OUTPUT_H__ +#define __OUTPUT_H__ + +#include + +#define OUTPUT_SIZE (1365) +extern uint8_t output[OUTPUT_SIZE]; + +#define GOLDEN_OUTPUT_SIZE (1365) +extern uint8_t golden_output[GOLDEN_OUTPUT_SIZE]; + +int check_output(); + +#endif // __OUTPUT_H__ diff --git a/neureka/idma_pointwise/inc/scale.h b/neureka/idma_pointwise/inc/scale.h new file mode 100644 index 0000000..67dea26 --- /dev/null +++ b/neureka/idma_pointwise/inc/scale.h @@ -0,0 +1,9 @@ +#ifndef __SCALE_H__ +#define __SCALE_H__ + +#include + +#define SCALE_SIZE (39) +extern uint8_t scale[SCALE_SIZE]; + +#endif // __SCALE_H__ diff --git a/neureka/idma_pointwise/inc/weight.h b/neureka/idma_pointwise/inc/weight.h new file mode 
100644 index 0000000..ba39fae --- /dev/null +++ b/neureka/idma_pointwise/inc/weight.h @@ -0,0 +1,9 @@ +#ifndef __WEIGHT_H__ +#define __WEIGHT_H__ + +#include + +#define WEIGHT_SIZE (11232) +extern uint8_t weight[WEIGHT_SIZE]; + +#endif // __WEIGHT_H__ diff --git a/neureka/idma_pointwise/main.c b/neureka/idma_pointwise/main.c new file mode 100644 index 0000000..dee3678 --- /dev/null +++ b/neureka/idma_pointwise/main.c @@ -0,0 +1,360 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "nnx_layer.h" +#include + +#include "neureka.h" +#include "neureka_gvsoc.h" +#include "neureka_testbench_bsp.h" +#include "neureka_task.h" +#include "pulp_nnx_neureka.h" + +#define NULL 0 + +typedef neureka_norm_mode_e nnx_norm_mode_e; +typedef neureka_quant_t nnx_quant_t; +typedef neureka_quant_function_e nnx_quant_function_e; +typedef neureka_norm_t nnx_norm_t; +typedef neureka_task_t nnx_task_t; +typedef neureka_dev_t nnx_dev_t; +typedef neureka_testbench_conf_t nnx_bsp_conf_t; +typedef neureka_task_flag_e nnx_task_flag_e; + +#define nnxTaskFlagTrue neurekaTaskFlagTrue +#define nnxTaskFlagFalse neurekaTaskFlagFalse + +#define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset +#define nnx_task_set_weight_source neureka_task_set_weight_source +#define nnx_task_set_activation_prefetch neureka_task_set_activation_prefetch +#define nnx_task_set_dims neureka_task_set_dims +#define nnx_task_set_ptrs_conv neureka_task_set_ptrs_conv +#define nnx_task_set_ptrs_norm_quant neureka_task_set_ptrs_norm_quant + +#define nnx_bsp_get_dev neureka_testbench_get_dev + +#define nnx_init neureka_nnx_init +#define nnx_dispatch_wait neureka_nnx_dispatch_wait +#define nnx_dispatch neureka_nnx_dispatch +#define nnx_resolve_wait neureka_nnx_resolve_wait +#define nnx_term neureka_nnx_term + +// Generated headers +#include "bias.h" +#include "input.h" +#include "layer_conf.h" +#include "output.h" +#include "scale.h" +#include "weight.h" + +static void task_prepare(nnx_task_t *task) { + nnx_task_init(task); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1); + nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + + nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + 
+#ifdef NEUREKA_WEIGHT_SOURCE_WMEM + nnx_task_set_weight_source(task, neurekaWeightSourceWmem); + nnx_task_set_activation_prefetch(task, activationPrefetchOn); +#else + neureka_task_set_weight_source(task, neurekaWeightSourceTcdm); + nnx_task_set_activation_prefetch(task, activationPrefetchOff); +#endif +#if INPUT_SIGNED == 1 + neureka_task_set_input_signed(task); +#else + neureka_task_set_input_unsigned(task); +#endif + + const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride; + const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride; + +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride, + WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_LEFT, + PADDING_RIGHT); +#else + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, + w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_LEFT, + PADDING_RIGHT); +#endif + + nnx_task_set_ptrs_conv(task, (uint32_t)input, INPUT_WIDTH, w_in_stride, + PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight); +#if HAS_NORM_QUANT == 1 +#if SCALE_BITS == 8 + const nnx_norm_mode_e normMode = normMode8Bit; +#elif SCALE_BITS == 32 + const nnx_norm_mode_e normMode = normMode32Bit; +#endif + + const nnx_task_flag_e flag_bias = + HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse; + const uint32_t bias_ptr = (uint32_t)(HAS_BIAS ? bias : NULL); + + nnx_quant_function_e quant_function = + HAS_RELU ? 
quantFunctionRelu : quantFunctionIdentity; + + nnx_task_set_norm_quant(task, + (nnx_quant_t){.shift_amount = OUTSHIFT, + .function = quant_function, + .flag_rounding = nnxTaskFlagFalse}, + (nnx_norm_t){.mode = normMode, + .flag_bias = flag_bias, + .flag_shift = nnxTaskFlagFalse}); + + nnx_task_set_ptrs_norm_quant(task, (uint32_t)scale, NULL, bias_ptr); +#endif // HAS_NORM_QUANT +} + +static void task_execute(nnx_task_t *task) { + nnx_dev_t *dev = nnx_bsp_get_dev(); + + nnx_bsp_conf_t conf = {.max_stall = 8}; + nnx_init(dev, &conf); + + nnx_dispatch_wait(dev); + + // printf("CFG:\n"); + // for (int i=0; idata)[i]); + // } +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, + OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, + WEIGHT_WIDTH); +#else + nnx_dispatch(dev, task); +#endif + + nnx_resolve_wait(dev, task); + + nnx_term(dev); + +} +#include +#include +#include +#include + +#include "layer_util.h" +#include "nnx_layer.h" +#include "output.h" +#include "input.h" +#include "weight.h" + + +#include +#include "pulp.h" + +#define VERBOSE_INIT_MEM +// #define VERBOSE_EXEC_IDMA +#define VERBOSE_TEST_MEM +#define VERBOSE_TEST_NEUREKA + +#define MAX_BUFFER_SIZE 0x2000 + +L2_DATA static unsigned char ext[MAX_BUFFER_SIZE]; +L1_DATA static unsigned char loc[MAX_BUFFER_SIZE]; + +#define EXT_DATA_ADDR ((unsigned int) ext) +#define TCDM_DATA_ADDR ((unsigned int) loc) +typedef enum {RX, TX} test_type_t; + +int test_idma(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr); +void initialize_mem(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr); +int test_mem(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr); +int execute_idma(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr); + +L1_DATA int global_error = 0; + +#define TX_SIZE (MAX_BUFFER_SIZE - 0x100) + +int main() { + if 
(rt_cluster_id() != 0) + return bench_cluster_forward(0); + + int error_count = 0; + int id = 0; + + nnx_task_t task; + + + if(get_core_id()==0) + { + initialize_mem(TX_SIZE, TX, ext, loc); + } + + if(get_core_id()==1) + task_prepare(&task); + + synch_barrier(); + + if(get_core_id()==0) + { + // manually delay to force overlapped execution + for(volatile cnt=0; cnt<500; cnt++); + id = execute_idma(TX_SIZE, TX, ext, loc); + } + + if(get_core_id()==1) + task_execute(&task); + + synch_barrier(); + + if(get_core_id()==0) + error_count = test_mem(TX_SIZE, TX, ext, loc); + if(get_core_id()==1) + error_count = check_output(); + + synch_barrier(); + + if(get_core_id()==0) + global_error += error_count; + + synch_barrier(); + + if(get_core_id()==1) + global_error += error_count; + + synch_barrier(); + + return global_error; +} + +int test_idma_loop(){ + int error_count = 0; + + for ( int i = 5; i < 8045; i=5*i) { + error_count += test_idma(i, TX, ext, loc); + } + for ( int i = 5; i < 8045; i=5*i ) { + error_count += test_idma(i, RX, ext, loc); + } + return error_count; +} + +int test_idma(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr){ + + int error_count = 0; + + initialize_mem(len, TX, ext, loc); + execute_idma(len, TX, ext, loc); + error_count = test_mem(len, TX, ext, loc); + return error_count; +} + +void initialize_mem(unsigned int len, test_type_t type, unsigned int ext_addr, unsigned int tcdm_addr){ + volatile unsigned int i; + if (type == RX){ + +#ifdef VERBOSE_INIT_MEM + printf ("INITIALIZING MEMORY FOR RX %d OPERATION: \n", len); +#endif + + for (i=0; i Success! No errors found.\n"); + else + printf("> Failure! 
Found %d/%d errors.\n", n_err, OUTPUT_SIZE); + return n_err; + } + + \ No newline at end of file diff --git a/neureka/idma_pointwise/src/scale.c b/neureka/idma_pointwise/src/scale.c new file mode 100644 index 0000000..5eb2c42 --- /dev/null +++ b/neureka/idma_pointwise/src/scale.c @@ -0,0 +1,10 @@ +#include "scale.h" + +#define SCALE_SIZE (39) +PI_L1 uint8_t scale[SCALE_SIZE] = { + 0x1e, 0x02, 0x1a, 0x1d, 0x15, 0x1b, 0x13, 0x07, 0x15, 0x0e, + 0x01, 0x11, 0x1b, 0x11, 0x0e, 0x11, 0x19, 0x13, 0x0d, 0x13, + 0x1e, 0x0f, 0x0b, 0x0b, 0x05, 0x1b, 0x04, 0x02, 0x0c, 0x14, + 0x0c, 0x04, 0x05, 0x0f, 0x07, 0x0f, 0x0f, 0x16, 0x1c +}; + diff --git a/neureka/idma_pointwise/src/weight.c b/neureka/idma_pointwise/src/weight.c new file mode 100644 index 0000000..140952b --- /dev/null +++ b/neureka/idma_pointwise/src/weight.c @@ -0,0 +1,1130 @@ +#include "weight.h" + +#define WEIGHT_SIZE (11232) +PI_L1 uint8_t weight[WEIGHT_SIZE] = { + 0x9f, 0x3b, 0x8a, 0x72, 0xf7, 0x43, 0x00, 0x69, 0xa1, 0xc9, + 0x09, 0x17, 0xa7, 0xab, 0xc5, 0xf7, 0xa7, 0xfa, 0x97, 0xa8, + 0xa5, 0x92, 0x11, 0xdf, 0xa7, 0x92, 0x91, 0xff, 0x58, 0x6d, + 0x6e, 0x00, 0x18, 0xb6, 0x1f, 0x16, 0x49, 0x91, 0x15, 0x60, + 0x06, 0x38, 0x2d, 0x55, 0x2d, 0xa3, 0x6e, 0x7e, 0xd2, 0x59, + 0x80, 0x79, 0x8f, 0x11, 0x4a, 0x7e, 0x8f, 0x11, 0x4a, 0x7e, + 0x70, 0xee, 0xb5, 0x81, 0x68, 0x39, 0xc7, 0xb5, 0x59, 0x5c, + 0xcd, 0x89, 0xfa, 0x75, 0x35, 0xdd, 0x0d, 0x89, 0x5c, 0xcd, + 0x9a, 0xee, 0x05, 0xba, 0x3d, 0xbc, 0x05, 0xca, 0x1d, 0xbc, + 0x05, 0xca, 0xe2, 0x43, 0xfa, 0x35, 0xee, 0xf7, 0xfa, 0x2d, + 0x88, 0xfb, 0x79, 0x5b, 0x2a, 0xc9, 0x15, 0x24, 0xe4, 0xbe, + 0x84, 0x12, 0x1d, 0x2d, 0x28, 0x42, 0xc6, 0xac, 0x00, 0xb2, + 0xc6, 0xac, 0x00, 0x12, 0x39, 0x53, 0xff, 0xed, 0xe0, 0x32, + 0xc6, 0x63, 0x0c, 0xea, 0x91, 0xf4, 0x00, 0x85, 0x05, 0x7f, + 0x6c, 0x88, 0x55, 0x3e, 0xfd, 0x98, 0x18, 0x1d, 0x6d, 0x9e, + 0x3b, 0x25, 0x6d, 0x9e, 0x3b, 0x25, 0x92, 0x61, 0xc4, 0xda, + 0x1e, 0x15, 0x6a, 0x82, 0x3d, 0x68, 0xeb, 0x2f, 0x6e, 0x76, + 0x1d, 0x53, 0xbb, 
0x91, 0x91, 0xa7, 0x5c, 0x4a, 0xfa, 0x3a, + 0xc9, 0x01, 0x2a, 0x1b, 0xc9, 0x01, 0x6a, 0x1b, 0x36, 0xfe, + 0x95, 0xe4, 0x13, 0x69, 0xdd, 0xf5, 0xdc, 0xed, 0xb0, 0x39, + 0xdc, 0xda, 0xf6, 0x76, 0xaf, 0x02, 0xaa, 0xc4, 0x42, 0x3a, + 0xe9, 0x4f, 0x4a, 0x1b, 0xff, 0x4f, 0xca, 0x1a, 0xfb, 0x4f, + 0x35, 0xe5, 0x04, 0xb0, 0x2f, 0x58, 0xab, 0xfa, 0x4c, 0x98, + 0x7f, 0x23, 0x21, 0xa9, 0xfd, 0xce, 0x34, 0xf0, 0x3a, 0x67, + 0x5b, 0x89, 0x6f, 0x3b, 0x01, 0x88, 0x26, 0x3b, 0x01, 0x88, + 0x2e, 0xbb, 0xfe, 0x77, 0xd1, 0x44, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0xd1, + 0x59, 0xb4, 0x26, 0x04, 0x6e, 0x6b, 0xb0, 0x5c, 0x36, 0xaf, + 0xde, 0x83, 0x05, 0xfe, 0x12, 0xc3, 0xe5, 0x71, 0xf6, 0x8f, + 0x0c, 0xcd, 0xf6, 0x8f, 0x24, 0xcd, 0x09, 0x70, 0xdb, 0x32, + 0xb0, 0x48, 0x23, 0x45, 0xb7, 0x93, 0xae, 0xcc, 0x66, 0x56, + 0x86, 0x3f, 0xa5, 0xe8, 0x76, 0xbf, 0xcf, 0x08, 0xa1, 0xed, + 0xff, 0x08, 0x1d, 0xad, 0xff, 0x08, 0x15, 0xad, 0x00, 0xf7, + 0xea, 0x52, 0x78, 0x44, 0x14, 0xba, 0x06, 0x42, 0x66, 0xcb, + 0xc3, 0xe8, 0x9d, 0x75, 0x48, 0x9b, 0x03, 0x94, 0x7b, 0x80, + 0xff, 0x8a, 0xfb, 0x11, 0xce, 0x96, 0x5b, 0x11, 0xce, 0x92, + 0xa4, 0xee, 0x31, 0x6d, 0xe5, 0xe4, 0x8d, 0x49, 0x4d, 0xb4, + 0x11, 0x98, 0x0c, 0x8e, 0x94, 0xbd, 0x3e, 0xfc, 0xbc, 0x24, + 0xe0, 0x7c, 0xfa, 0x85, 0x95, 0xcc, 0x9d, 0x87, 0x94, 0x4c, + 0x9d, 0x87, 0x6b, 0xb3, 0x62, 0x78, 0x01, 0x54, 0xa5, 0xaa, + 0xd9, 0xdd, 0x03, 0xe6, 0xfc, 0x51, 0xce, 0x8e, 0xd4, 0x0c, + 0x6a, 0xe7, 0x4c, 0xf4, 0xfd, 0x13, 0x58, 0x74, 0x87, 0xb3, + 0x58, 0x74, 0x87, 0xbb, 0xa7, 0x8b, 0x78, 0x44, 0x8a, 0x0d, + 0x4a, 0xe6, 0x03, 0x6e, 0x2c, 0xfe, 0xb5, 0x8b, 0x96, 0xbd, + 0xc1, 0xdc, 0xf7, 0xb1, 0x51, 0xc7, 0x5f, 0x03, 0x51, 0xf7, + 0x7b, 0x95, 0x51, 0xe7, 0x7b, 0x95, 0xae, 0x18, 0x84, 0x6a, + 0x54, 0x66, 0x1d, 0xda, 0x22, 0x63, 0xdd, 0x34, 0x7a, 0x6a, + 0x2e, 0x56, 0x99, 0x82, 0x9a, 
0xc7, 0xf2, 0x7e, 0x90, 0x5e, + 0x7a, 0x4e, 0xda, 0x76, 0xfa, 0x4e, 0xda, 0x76, 0x05, 0xb1, + 0x25, 0x89, 0x0d, 0xef, 0x72, 0xd8, 0x3c, 0xe2, 0x2a, 0x60, + 0x9d, 0xd5, 0xf0, 0x43, 0x2b, 0x50, 0x86, 0x30, 0xea, 0x59, + 0x9d, 0x45, 0xe8, 0x18, 0xd9, 0xe7, 0xe8, 0x18, 0xd9, 0xe7, + 0x17, 0xe7, 0x26, 0x18, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x4a, 0x80, 0xce, + 0xd0, 0x4e, 0x9b, 0x8a, 0xba, 0xa5, 0xd9, 0x53, 0x75, 0x4c, + 0x98, 0x23, 0xcc, 0xe4, 0xd7, 0x31, 0x3d, 0x00, 0xfd, 0x0b, + 0x7d, 0x00, 0xdd, 0x0b, 0x82, 0xff, 0x22, 0xf4, 0x75, 0x49, + 0x5f, 0xcc, 0xdb, 0xae, 0xde, 0xab, 0xdd, 0x30, 0x8b, 0x2c, + 0xa6, 0xc4, 0xb5, 0xf1, 0x97, 0x01, 0x7a, 0x23, 0xa3, 0x95, + 0x0e, 0x2d, 0xa3, 0x95, 0x2e, 0x2d, 0x5c, 0x6a, 0xd1, 0xd2, + 0x65, 0xf0, 0x53, 0x99, 0xf0, 0x91, 0xef, 0x32, 0x96, 0xf7, + 0x12, 0xc3, 0xe0, 0x17, 0x13, 0xac, 0x3c, 0x53, 0x2f, 0x1f, + 0x54, 0x77, 0x33, 0xde, 0x14, 0x77, 0x33, 0x9e, 0xeb, 0x88, + 0xcc, 0x61, 0x02, 0x02, 0xca, 0xed, 0xa2, 0xa4, 0xcf, 0x20, + 0xf7, 0x92, 0xdb, 0x88, 0x11, 0x86, 0xee, 0xcf, 0x9a, 0xe0, + 0xc9, 0x6f, 0x9b, 0x82, 0x88, 0xee, 0x9b, 0x82, 0xc8, 0xee, + 0x64, 0x7d, 0x37, 0x11, 0x46, 0x01, 0xa3, 0xbc, 0x0e, 0xc4, + 0x72, 0xd1, 0xc3, 0x85, 0xad, 0xcb, 0x11, 0x25, 0x56, 0x89, + 0x4e, 0x27, 0xcd, 0xcd, 0xa6, 0xa7, 0x6c, 0x7d, 0x86, 0x27, + 0xec, 0xfd, 0x79, 0xd8, 0x13, 0x02, 0xa0, 0x26, 0xfe, 0xe8, + 0xfc, 0xf5, 0x64, 0x2d, 0x59, 0xa7, 0x92, 0xd4, 0x5a, 0xd3, + 0xbe, 0x77, 0x08, 0x4e, 0x04, 0x86, 0x50, 0xce, 0x36, 0x84, + 0x50, 0xce, 0x36, 0x84, 0xaf, 0x31, 0xc9, 0x7b, 0xbb, 0x25, + 0x67, 0x70, 0x0c, 0x20, 0x92, 0x7f, 0x30, 0xc2, 0xd0, 0x88, + 0x27, 0xaa, 0x32, 0xaf, 0x65, 0xd0, 0x19, 0x4f, 0x83, 0x98, + 0x1e, 0xb3, 0x03, 0x98, 0x1e, 0xb3, 0xfc, 0x67, 0xe1, 0x4c, + 0x73, 0x1e, 0x1f, 0xd8, 0xb7, 0x16, 0x15, 0x8b, 0x96, 0xc5, + 0x15, 0xa3, 0xa5, 0x2b, 0xae, 0x7a, 0x30, 
0xb5, 0x0f, 0x89, + 0x2f, 0x92, 0x37, 0xa8, 0x27, 0x92, 0x17, 0xa8, 0xd8, 0x6d, + 0xe8, 0x57, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x4b, 0x8a, 0x05, 0xdc, 0x97, 0x61, + 0xcf, 0x05, 0xb4, 0x98, 0x55, 0x98, 0xc4, 0xc5, 0xd8, 0xb1, + 0x28, 0x49, 0x74, 0x9d, 0x89, 0x8b, 0x74, 0xbd, 0x89, 0xcb, + 0x74, 0xbd, 0x76, 0x34, 0x8b, 0x42, 0xe3, 0x67, 0x8e, 0x28, + 0x5e, 0x2e, 0x32, 0x1f, 0xdf, 0x9e, 0xa1, 0x1f, 0xcf, 0x0f, + 0x1b, 0x16, 0xaa, 0x3e, 0x33, 0x31, 0x83, 0x37, 0xb1, 0x75, + 0x83, 0x36, 0xb1, 0x75, 0x7c, 0xc9, 0x4e, 0x8a, 0xe4, 0x3f, + 0xc5, 0x8b, 0xd2, 0x10, 0xf2, 0xc2, 0x99, 0x4f, 0x15, 0xe1, + 0x72, 0xfc, 0x83, 0xf4, 0x79, 0x03, 0x05, 0x83, 0x79, 0x0f, + 0xc7, 0x80, 0x79, 0x0f, 0xc7, 0x80, 0x86, 0xf0, 0x38, 0x7f, + 0x91, 0x81, 0xee, 0x27, 0xf3, 0x13, 0xbd, 0x47, 0x55, 0xbf, + 0xe1, 0x5e, 0xa4, 0x47, 0xed, 0xf3, 0x1b, 0x7b, 0x41, 0x5b, + 0x26, 0x5b, 0xe5, 0x7b, 0x26, 0x5b, 0x65, 0x7b, 0xd9, 0xa4, + 0x9a, 0x84, 0x3b, 0x70, 0x1e, 0x38, 0x31, 0x21, 0x9c, 0x48, + 0xe7, 0x9c, 0x1b, 0x8d, 0x67, 0xc9, 0x9c, 0x89, 0x26, 0x1a, + 0xc3, 0x2f, 0x27, 0xd8, 0xf3, 0xbb, 0x27, 0x98, 0xb3, 0xab, + 0xd8, 0x67, 0x4c, 0x54, 0x48, 0xbd, 0xf9, 0x4f, 0x2c, 0x37, + 0xc9, 0xdd, 0x4d, 0xef, 0x59, 0x09, 0x73, 0x9d, 0xb8, 0x36, + 0x9b, 0x06, 0x11, 0xc0, 0xb3, 0x16, 0x5d, 0x09, 0xb3, 0x16, + 0x59, 0x09, 0x4c, 0xe9, 0xa6, 0xf6, 0x25, 0xf5, 0xf8, 0x08, + 0x9b, 0x75, 0x87, 0x32, 0x0a, 0xd0, 0x64, 0x04, 0xbb, 0x9d, + 0x5c, 0x30, 0x75, 0xa3, 0x9e, 0x55, 0xb7, 0x94, 0xcd, 0x01, + 0xb7, 0x97, 0xcd, 0x01, 0x48, 0x68, 0x32, 0xfe, 0x4f, 0x22, + 0x33, 0xd1, 0x46, 0x2a, 0x5d, 0xc7, 0x69, 0x59, 0xf0, 0xc2, + 0x1e, 0xc6, 0xb6, 0xca, 0xa0, 0x1f, 0xd6, 0x1f, 0x2b, 0xc6, + 0xdc, 0xfb, 0x29, 0xc6, 0xd4, 0xfb, 0xd6, 0x39, 0x2b, 0x04, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x78, 0xdb, 0x23, 0x5a, 0xc8, 0xac, 0xb1, 0x69, + 0x2d, 0x8a, 0xc1, 0x6a, 0x7b, 0x9c, 0x59, 0x1c, 0xc6, 0x09, + 0x3e, 0x88, 0xfe, 0x00, 0x5b, 0x68, 0xfe, 0x00, 0x5b, 0x68, + 0x01, 0xff, 0xa4, 0x97, 0x62, 0xd2, 0xf9, 0xdf, 0x04, 0x54, + 0x66, 0xcf, 0x69, 0x13, 0x3e, 0x79, 0xc4, 0xa3, 0x28, 0x1c, + 0x81, 0xb4, 0x86, 0xa5, 0xc4, 0xa4, 0xa8, 0x11, 0xc4, 0xa4, + 0xa0, 0x11, 0x3b, 0x5b, 0x5f, 0xee, 0xbe, 0x03, 0x85, 0x09, + 0x4c, 0x70, 0xa5, 0x8a, 0x8a, 0xe9, 0x64, 0x34, 0x4d, 0x6b, + 0x08, 0xe0, 0x6f, 0x60, 0xdc, 0x8c, 0x4f, 0x72, 0xc8, 0xa4, + 0x4f, 0x70, 0xc8, 0xac, 0xb0, 0x8f, 0x37, 0x53, 0xd0, 0xd6, + 0xbe, 0x82, 0x0b, 0x60, 0xbc, 0x0b, 0xea, 0xc9, 0x00, 0xb8, + 0x4d, 0xf7, 0xc5, 0x2b, 0x43, 0xc2, 0xcd, 0x2c, 0x4b, 0x4a, + 0x85, 0x16, 0x4b, 0xc8, 0xc5, 0x16, 0xb4, 0x37, 0x3a, 0xe9, + 0x1f, 0x07, 0x35, 0x29, 0x54, 0x16, 0x57, 0xfc, 0xac, 0x94, + 0x3b, 0xf2, 0xb7, 0x54, 0xa6, 0x89, 0xc4, 0xa5, 0x94, 0x63, + 0x85, 0x87, 0xe3, 0xe7, 0x85, 0x87, 0xa2, 0xe3, 0x7a, 0x78, + 0x5d, 0x1c, 0x11, 0xa7, 0xf8, 0x78, 0x8c, 0x11, 0xa1, 0xe8, + 0x75, 0xbf, 0x6e, 0x3f, 0x05, 0xe7, 0x2d, 0xae, 0x9e, 0xed, + 0x9a, 0x32, 0xd1, 0xef, 0xdb, 0x22, 0xd1, 0xef, 0x9b, 0x22, + 0x2e, 0x10, 0x64, 0xdd, 0x00, 0xb6, 0x03, 0x5d, 0x88, 0x5b, + 0x8f, 0xc7, 0x71, 0x40, 0x83, 0x28, 0x6f, 0x1d, 0xf3, 0xf3, + 0x7c, 0x40, 0xd4, 0xe9, 0x4c, 0x51, 0xc8, 0xe3, 0x4c, 0x51, + 0xc4, 0xe3, 0xb3, 0xae, 0x3b, 0x1c, 0xa8, 0x94, 0x08, 0x8f, + 0x88, 0x6d, 0x96, 0x3d, 0xcd, 0x50, 0xa6, 0xf4, 0x58, 0x41, + 0xd7, 0xa1, 0x58, 0x3a, 0x2d, 0x4e, 0x74, 0x78, 0x2f, 0xcc, + 0x74, 0x78, 0x2f, 0xcc, 0x8b, 0x87, 0xd0, 0x33, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0xca, 0xd3, 0x0c, 0x83, 0x34, 0xb6, 0x8c, 0x65, 0xb7, 0x0f, + 0x20, 0x7d, 0xf7, 0xaa, 0x57, 0xc3, 0x1b, 0x4a, 0xb4, 0xcf, + 0x6b, 
0x2a, 0xb4, 0x76, 0x7b, 0x2a, 0xb4, 0x76, 0x84, 0xd5, + 0x4b, 0x89, 0x47, 0xe6, 0x1e, 0xd2, 0x99, 0x2c, 0x18, 0xf7, + 0x80, 0x02, 0x61, 0x8a, 0x12, 0xfc, 0x89, 0xa3, 0x99, 0x93, + 0x64, 0xec, 0x8d, 0xb2, 0xe1, 0xf7, 0x89, 0xb3, 0xe1, 0xf7, + 0x76, 0x4c, 0x1e, 0x08, 0x1f, 0xff, 0x88, 0x4b, 0x72, 0x31, + 0x0e, 0x14, 0xd1, 0x68, 0x91, 0x9d, 0x2d, 0x90, 0xa8, 0x8d, + 0xc1, 0x85, 0xf8, 0x6e, 0x14, 0x80, 0xb2, 0x6d, 0x04, 0x80, + 0xb8, 0x6d, 0xfb, 0x7f, 0x47, 0x92, 0x6c, 0x25, 0xfe, 0xbb, + 0x22, 0x74, 0xac, 0x32, 0x08, 0xd0, 0xb9, 0x98, 0x8a, 0x2c, + 0xc9, 0xba, 0xc4, 0x03, 0x70, 0x24, 0xcc, 0x07, 0xbd, 0x38, + 0xcc, 0x07, 0xbc, 0x38, 0x33, 0xf8, 0x43, 0xc7, 0x03, 0x8f, + 0x12, 0x85, 0x7e, 0xc2, 0xa2, 0x58, 0x2e, 0xdf, 0xee, 0xd9, + 0x5a, 0x4e, 0x35, 0x0f, 0xdc, 0x19, 0x68, 0xc3, 0x56, 0x9a, + 0x01, 0x5b, 0x5e, 0x1a, 0x41, 0x5b, 0xa1, 0xe5, 0xbe, 0xa4, + 0x61, 0x55, 0xb5, 0x36, 0x7c, 0xaf, 0x79, 0x28, 0xee, 0x65, + 0x4c, 0x17, 0x29, 0xa6, 0x9a, 0xe1, 0xdb, 0xd6, 0xfa, 0x83, + 0x5f, 0xe8, 0xac, 0x29, 0x5d, 0xc6, 0xa8, 0x29, 0xa2, 0x39, + 0x57, 0xd6, 0xf4, 0xa1, 0x69, 0x75, 0x8b, 0x66, 0x66, 0xbd, + 0xe9, 0xd5, 0x3f, 0xda, 0x69, 0x1e, 0x0a, 0x95, 0x47, 0xe9, + 0x00, 0x5b, 0xa3, 0xe0, 0x79, 0x0e, 0xa3, 0xe8, 0x58, 0x1a, + 0x5c, 0x17, 0xa7, 0xe5, 0xc2, 0x17, 0xe0, 0xda, 0x9e, 0x76, + 0x68, 0x43, 0x21, 0xd1, 0xff, 0x6c, 0x24, 0x85, 0x79, 0x50, + 0x75, 0xf4, 0x02, 0xcb, 0xe5, 0xfc, 0x44, 0xe7, 0x65, 0xf4, + 0x40, 0xee, 0x9a, 0x0b, 0xbf, 0x11, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x5f, + 0x4b, 0x87, 0xac, 0x0d, 0xca, 0xed, 0x8e, 0x1c, 0x34, 0xe7, + 0xaa, 0x88, 0xa2, 0x36, 0x5c, 0x47, 0xa8, 0xd2, 0x00, 0x04, + 0xeb, 0x96, 0x04, 0x04, 0xea, 0x96, 0xfb, 0xfb, 0x15, 0x69, + 0x0c, 0x78, 0xa1, 0x71, 0x29, 0x4b, 0x5e, 0x7f, 0x2a, 0xdd, + 0x8e, 0x99, 0xc7, 0x3a, 0x91, 0x94, 0xee, 0x9b, 0x1d, 0x25, + 0xbe, 0x11, 0x15, 
0x00, 0xbe, 0x19, 0x1d, 0x00, 0x41, 0xe6, + 0xe2, 0xff, 0x88, 0xdb, 0x6b, 0x02, 0x47, 0xe6, 0x35, 0xe8, + 0xa7, 0x9b, 0x5b, 0x99, 0xc6, 0xb8, 0xd7, 0xa2, 0x08, 0x03, + 0x76, 0x40, 0xe0, 0x97, 0xfb, 0xba, 0xc0, 0x97, 0xfb, 0xaa, + 0x3f, 0x68, 0x04, 0x55, 0x4a, 0x42, 0x8e, 0xa3, 0x33, 0x66, + 0xa3, 0x39, 0x4e, 0xc0, 0xf9, 0x7e, 0x05, 0x86, 0x00, 0x34, + 0x8b, 0x7f, 0x5e, 0x3b, 0x80, 0x6f, 0x5c, 0xbb, 0x81, 0x6f, + 0x5c, 0xbb, 0x7e, 0x90, 0xa3, 0x44, 0xea, 0xb7, 0xac, 0x08, + 0x62, 0x5d, 0x4a, 0x25, 0x38, 0xd8, 0xef, 0xb8, 0x15, 0xa7, + 0xe9, 0xda, 0x80, 0xcf, 0x71, 0x23, 0x16, 0xa6, 0x3b, 0x73, + 0x14, 0xae, 0x7b, 0x73, 0xeb, 0x51, 0x84, 0x8c, 0x7a, 0x61, + 0x8d, 0x92, 0x48, 0x9f, 0xd7, 0x0e, 0x62, 0x8e, 0x5a, 0x58, + 0x47, 0x60, 0x35, 0x9c, 0x5a, 0x6e, 0x10, 0x2f, 0x0a, 0x19, + 0x1c, 0x3f, 0x0a, 0x19, 0x1c, 0x3f, 0xf5, 0xe6, 0xe3, 0xc0, + 0x07, 0x93, 0x9e, 0x91, 0x7b, 0x6d, 0x0a, 0x4a, 0x2a, 0x21, + 0xdb, 0xfb, 0x40, 0x1c, 0x96, 0xff, 0xa1, 0x8e, 0x14, 0xde, + 0x12, 0xff, 0x30, 0x7e, 0x02, 0xff, 0x12, 0xfe, 0xfd, 0x00, + 0xed, 0x01, 0x42, 0x11, 0x91, 0xaa, 0xfe, 0x89, 0x1f, 0x1a, + 0x9b, 0xb7, 0xb2, 0xf6, 0xef, 0x73, 0x92, 0x49, 0x8d, 0x84, + 0xcc, 0x21, 0x98, 0x86, 0x98, 0x06, 0x98, 0x86, 0x98, 0x02, + 0x67, 0x79, 0x67, 0xfd, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x38, 0xcd, 0x21, 0x7b, + 0x6a, 0xaf, 0x7c, 0xf0, 0xc4, 0x58, 0x07, 0x7f, 0x2d, 0xfb, + 0xdb, 0xfe, 0xa7, 0x81, 0x9d, 0x89, 0x8d, 0xd9, 0xd9, 0xca, + 0xad, 0xd9, 0xd9, 0xca, 0x52, 0x26, 0x26, 0x35, 0x26, 0xe5, + 0xf5, 0xfe, 0x3b, 0xa4, 0x71, 0x1b, 0x44, 0xcb, 0xfd, 0xb6, + 0x9a, 0xf3, 0x27, 0x22, 0xc8, 0xfa, 0x78, 0x6c, 0xe9, 0xfa, + 0x6e, 0x74, 0xe8, 0xfa, 0x6e, 0x74, 0x17, 0x05, 0x91, 0x8b, + 0x93, 0x49, 0xa2, 0x64, 0xfc, 0x78, 0x4b, 0xb5, 0x20, 0x51, + 0x90, 0xb5, 0x4e, 0x65, 0x9e, 0xa3, 0x28, 0xa0, 0x04, 0xba, + 0x40, 0xa6, 0x48, 0x39, 0x40, 
0xa6, 0x48, 0x39, 0xbf, 0x59, + 0xb7, 0xc6, 0x2f, 0xf4, 0xdf, 0x7b, 0x46, 0x8f, 0x30, 0x10, + 0x65, 0x59, 0x79, 0x8e, 0xc7, 0x96, 0xe4, 0xed, 0x3f, 0x2b, + 0x6d, 0x9f, 0xae, 0xad, 0xe9, 0x99, 0xae, 0xaf, 0x69, 0x9d, + 0x51, 0x50, 0x96, 0x62, 0xb9, 0x23, 0x8e, 0xa3, 0x0b, 0x15, + 0x87, 0x3a, 0x9b, 0x56, 0x2d, 0x71, 0x0e, 0x35, 0xb6, 0xea, + 0x8d, 0xce, 0xa1, 0x29, 0x8e, 0x2a, 0xfa, 0x0b, 0x8e, 0x0a, + 0xea, 0x09, 0x71, 0xf5, 0x15, 0xf6, 0x61, 0x8e, 0xce, 0xa5, + 0x47, 0xce, 0x20, 0x70, 0x3b, 0xb3, 0x8b, 0xa1, 0x0b, 0xbe, + 0x46, 0x5f, 0x2a, 0x45, 0xdc, 0x1b, 0x8b, 0x9a, 0x10, 0xbf, + 0x8b, 0x9a, 0x90, 0xbf, 0x74, 0x65, 0x6f, 0x40, 0xfd, 0x99, + 0x2b, 0x2c, 0x34, 0xd2, 0x98, 0x6c, 0xa2, 0x96, 0x7f, 0xeb, + 0x01, 0x7f, 0xce, 0xf3, 0x57, 0xde, 0x3a, 0xbc, 0x73, 0xbe, + 0x1d, 0xd9, 0x73, 0xbe, 0x1f, 0xd9, 0x8c, 0x41, 0xe0, 0x26, + 0xce, 0xa8, 0xec, 0x37, 0xd6, 0x13, 0x47, 0xb2, 0x58, 0xbd, + 0x88, 0xa5, 0x22, 0xbb, 0x85, 0x61, 0xaf, 0xbd, 0x6d, 0x5a, + 0x32, 0x89, 0xed, 0x6b, 0x3a, 0x99, 0xed, 0x6b, 0xc5, 0x66, + 0x12, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x7c, 0x06, 0x63, 0x8a, 0x5e, 0x18, + 0xbb, 0x88, 0x3a, 0x7a, 0xf9, 0xd3, 0x4d, 0x85, 0xfe, 0x2a, + 0x19, 0x8f, 0x50, 0xc3, 0xcd, 0x23, 0x9f, 0xcc, 0xcd, 0x2b, + 0xdf, 0xce, 0x32, 0xd4, 0x20, 0x31, 0x81, 0x32, 0xe7, 0x3e, + 0x93, 0x3a, 0x17, 0x89, 0x21, 0x48, 0xf5, 0xd7, 0x60, 0x44, + 0x37, 0x04, 0x78, 0x53, 0xb3, 0xed, 0xb0, 0xcc, 0x3e, 0xad, + 0xb0, 0xc4, 0x36, 0xad, 0x4f, 0x3b, 0xc9, 0x52, 0x3a, 0x01, + 0x6b, 0xa2, 0xf2, 0x11, 0x50, 0x3d, 0x30, 0xb4, 0x5e, 0xfe, + 0x87, 0x19, 0x58, 0x7c, 0x7d, 0x90, 0xa4, 0x7a, 0xb9, 0x3c, + 0xb8, 0xf6, 0xb9, 0x3c, 0xb8, 0xfe, 0x46, 0xc3, 0x47, 0x01, + 0xbb, 0xb5, 0xfc, 0x72, 0xc6, 0x36, 0x7c, 0xbd, 0x08, 0x16, + 0xf8, 0x39, 0x0e, 0xcf, 0xed, 0xa8, 0x1a, 0x8f, 0xd0, 0x19, + 0x53, 0xe7, 0x55, 0x39, 0x13, 0xe7, 0x55, 
0x39, 0xec, 0x18, + 0xaa, 0xc6, 0xc2, 0x10, 0x66, 0xbf, 0x83, 0xbf, 0x7a, 0x46, + 0x84, 0xe2, 0x90, 0xf9, 0x16, 0xf6, 0x0d, 0x2e, 0x1c, 0x8f, + 0xb0, 0x81, 0x08, 0x96, 0xf4, 0x1b, 0x18, 0x96, 0xf4, 0x1b, + 0xe7, 0x69, 0x0b, 0xe4, 0x7f, 0x88, 0xfd, 0xa9, 0xaa, 0x9c, + 0xb8, 0x31, 0x91, 0xf0, 0xe1, 0xa6, 0xe2, 0x4e, 0x8d, 0x99, + 0x15, 0xbe, 0x00, 0x0c, 0x79, 0xfa, 0x69, 0x97, 0x79, 0xfe, + 0x49, 0x95, 0x86, 0x01, 0xb6, 0x6a, 0xc8, 0xde, 0x64, 0x2c, + 0x07, 0x35, 0xdf, 0x26, 0x62, 0x1f, 0xa3, 0xc3, 0x95, 0xfc, + 0x2d, 0x04, 0x46, 0x90, 0x31, 0xbd, 0x25, 0x7d, 0x73, 0x0c, + 0x25, 0x7c, 0x73, 0x0c, 0xda, 0x83, 0x8c, 0xf3, 0xfe, 0x6b, + 0xd7, 0x46, 0x86, 0xf6, 0x0b, 0x5e, 0x55, 0xde, 0x4c, 0x5d, + 0x67, 0xec, 0x0f, 0x6c, 0x7c, 0x97, 0x6a, 0x2a, 0xe6, 0x8f, + 0x2a, 0xa3, 0x66, 0xdf, 0x2a, 0xab, 0x99, 0x20, 0xd5, 0x54, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x12, 0x00, 0x77, 0x8b, 0xee, 0x27, 0x42, 0x81, + 0x7c, 0x43, 0x31, 0xf3, 0x45, 0x89, 0xed, 0x09, 0x18, 0x79, + 0xda, 0x83, 0xce, 0xf8, 0x4f, 0x03, 0xce, 0xf8, 0xdb, 0x03, + 0x31, 0x07, 0x24, 0xfc, 0x84, 0xb3, 0x04, 0x7b, 0x22, 0xf6, + 0xa6, 0x45, 0x8a, 0x6a, 0xcb, 0x6f, 0x5e, 0x6b, 0x31, 0x55, + 0xe7, 0x2d, 0x17, 0x2a, 0xf4, 0x03, 0x33, 0x3b, 0xf6, 0x43, + 0x33, 0x3b, 0x09, 0xbc, 0xcc, 0xc4, 0x16, 0x4c, 0x18, 0xd8, + 0xc8, 0xa2, 0xa8, 0x45, 0xed, 0xd9, 0xc8, 0x6a, 0xe1, 0x39, + 0x8c, 0x26, 0x2c, 0x21, 0x8e, 0x26, 0x40, 0xf1, 0x86, 0x04, + 0x78, 0x71, 0x8e, 0x26, 0x87, 0x8e, 0x71, 0xd9, 0x61, 0x1e, + 0xb0, 0x82, 0x4b, 0x54, 0xb5, 0xde, 0x8d, 0xba, 0x26, 0xc7, + 0xbd, 0x54, 0xc7, 0x6f, 0x2c, 0x1f, 0x50, 0x38, 0x18, 0x5d, + 0x40, 0x3e, 0x18, 0x5d, 0x40, 0x3e, 0xe7, 0xa2, 0xbf, 0xc1, + 0x2d, 0x17, 0x5a, 0xb0, 0xa9, 0xdd, 0x60, 0x10, 0xe9, 0xd3, + 0x15, 0x8a, 0x7d, 0x94, 0x20, 0xf1, 0x8e, 0x0c, 0xdb, 0x48, + 0xef, 0x4b, 0xab, 0x9e, 0xef, 0x4b, 0xab, 0xde, 0x10, 
0xb4, + 0x54, 0x21, 0x2c, 0x3e, 0x3a, 0x45, 0x53, 0x0e, 0x7f, 0x2f, + 0x44, 0x31, 0x55, 0x48, 0xec, 0x31, 0x2b, 0x38, 0xb4, 0xa0, + 0xe1, 0x0a, 0xbc, 0x0b, 0x61, 0x2b, 0xbc, 0x03, 0x61, 0x6b, + 0x43, 0xfc, 0x9e, 0xd4, 0x6c, 0x85, 0xa1, 0x59, 0x6c, 0x25, + 0x95, 0x9c, 0x63, 0xd9, 0xbc, 0x35, 0x18, 0x03, 0x89, 0x6b, + 0xc7, 0xd2, 0xf5, 0xee, 0xe3, 0x71, 0xc5, 0xaa, 0xe3, 0x51, + 0xc5, 0xea, 0x1c, 0xae, 0x3a, 0x15, 0x05, 0x1b, 0x2c, 0xd0, + 0x30, 0xc2, 0x94, 0x31, 0x54, 0xc8, 0x8b, 0x8f, 0xde, 0x62, + 0x1d, 0x58, 0x99, 0x77, 0x2c, 0xc9, 0x1c, 0xf3, 0x34, 0x4d, + 0x1c, 0xf3, 0x3c, 0x4d, 0xe3, 0x0c, 0xc3, 0xb2, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x14, 0x2e, 0x50, 0x55, 0x21, 0x4f, 0xb4, 0x93, 0x75, 0x95, + 0x3f, 0x2f, 0x6e, 0x70, 0x02, 0x44, 0x9e, 0xd4, 0xee, 0x10, + 0xeb, 0x75, 0x42, 0x4c, 0xeb, 0x75, 0x42, 0x44, 0x14, 0x8a, + 0xbd, 0xbb, 0xef, 0x2d, 0xb8, 0x77, 0x9f, 0x2c, 0x1d, 0x29, + 0xe0, 0x05, 0x9c, 0x5a, 0xcf, 0xf6, 0xf6, 0x79, 0x4b, 0x06, + 0xac, 0xef, 0xaa, 0x94, 0xac, 0xed, 0xca, 0x14, 0xac, 0xef, + 0x35, 0xeb, 0x53, 0x10, 0xb9, 0x22, 0x21, 0xcc, 0x95, 0x41, + 0x36, 0xec, 0x7c, 0x83, 0x66, 0xe0, 0x36, 0x48, 0x65, 0xe4, + 0x7a, 0xf2, 0xe2, 0x56, 0xb6, 0xf0, 0xe0, 0xee, 0xb6, 0xf0, + 0xe0, 0xee, 0x49, 0x0f, 0x1f, 0x11, 0x7d, 0x76, 0xd8, 0xa0, + 0xc1, 0x10, 0xb9, 0x4c, 0xcd, 0xb2, 0xa4, 0x27, 0x53, 0x97, + 0xe3, 0x7e, 0xe0, 0xfb, 0x95, 0x97, 0x61, 0x6a, 0x96, 0x1b, + 0x61, 0x7b, 0x96, 0x9b, 0x9e, 0x84, 0x69, 0x64, 0x3e, 0x37, + 0xa1, 0xb8, 0x3a, 0x09, 0xee, 0x34, 0x2a, 0xd8, 0x7a, 0x51, + 0x8b, 0x6c, 0xe8, 0x7d, 0x9c, 0x51, 0x8f, 0xc3, 0x90, 0x1d, + 0xbe, 0xc4, 0x90, 0x19, 0xbf, 0xc4, 0x6f, 0xe6, 0x40, 0x3b, + 0x13, 0xa1, 0x45, 0x1b, 0x0b, 0x8b, 0x22, 0xec, 0x34, 0x25, + 0x8e, 0x44, 0x8c, 0x9b, 0x72, 0x12, 0xb8, 0x8b, 0x5f, 0x54, + 0xb0, 0x70, 0x1a, 0x74, 0xb0, 0xba, 0x1a, 0x14, 0x4f, 0x45, + 0xe5, 
0xeb, 0x52, 0xdc, 0x5b, 0x77, 0xdb, 0xa7, 0xac, 0x22, + 0x15, 0xf9, 0xd9, 0xa4, 0xc9, 0x6a, 0xf6, 0x9f, 0xb3, 0x62, + 0xd4, 0x57, 0xb1, 0x6a, 0xa9, 0xc5, 0xb1, 0x6a, 0xb9, 0xc5, + 0x4e, 0x95, 0x46, 0x3a, 0x82, 0xa5, 0x81, 0x62, 0x47, 0xa5, + 0xa6, 0xca, 0x90, 0x67, 0xee, 0x22, 0x57, 0x4d, 0x11, 0x48, + 0xe9, 0x55, 0x2d, 0xf7, 0x7a, 0x17, 0x29, 0x8e, 0x7a, 0x55, + 0x29, 0xce, 0x85, 0xaa, 0xd6, 0x31, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0xea, + 0x42, 0x9b, 0x87, 0x51, 0xd4, 0xb1, 0x0d, 0xa3, 0x13, 0x96, + 0xed, 0xf5, 0x90, 0xce, 0x49, 0x6b, 0xd9, 0xb8, 0x19, 0x62, + 0x70, 0xb2, 0x19, 0x63, 0x50, 0xb2, 0xe6, 0x9c, 0xaf, 0x4d, + 0x25, 0x1a, 0x7e, 0x0d, 0x88, 0x4b, 0x2e, 0xb3, 0xea, 0x33, + 0x81, 0xe3, 0xb2, 0x90, 0xcb, 0x97, 0xd5, 0x33, 0x13, 0x84, + 0xb4, 0x03, 0x11, 0x31, 0xb4, 0x03, 0x11, 0x10, 0x4b, 0xfc, + 0xee, 0xef, 0x5a, 0x56, 0xd7, 0x2a, 0xc2, 0x83, 0x8a, 0x07, + 0xa1, 0x6f, 0xf9, 0x84, 0x26, 0x21, 0x67, 0xa5, 0xe2, 0x8a, + 0xc6, 0xb3, 0x52, 0x0e, 0xce, 0x21, 0x52, 0x0e, 0xce, 0x21, + 0xad, 0xf1, 0x31, 0xde, 0x28, 0x6a, 0x17, 0x5b, 0x92, 0xd0, + 0xf5, 0x33, 0xfa, 0x22, 0x54, 0xf7, 0x82, 0xc3, 0x71, 0x73, + 0x80, 0x48, 0x1f, 0x7a, 0x9c, 0x70, 0x74, 0xba, 0x98, 0x70, + 0x55, 0xba, 0x67, 0x8f, 0xaa, 0x45, 0x28, 0x9f, 0xb7, 0xcc, + 0x72, 0x83, 0x2a, 0xe7, 0x9c, 0xa6, 0xa3, 0x8f, 0x38, 0xe7, + 0x6d, 0x7d, 0x3d, 0xb0, 0x83, 0xfb, 0x23, 0xf1, 0x82, 0xff, + 0x2b, 0xb1, 0x83, 0xff, 0xd4, 0x4e, 0x7c, 0x00, 0x7c, 0x79, + 0x36, 0xcc, 0x2d, 0xf5, 0xae, 0x00, 0x8d, 0x7b, 0xa7, 0xcf, + 0xb2, 0xde, 0xa2, 0x7c, 0xf2, 0xd5, 0x95, 0x0d, 0x94, 0xb0, + 0x75, 0x26, 0x96, 0xf0, 0x35, 0x24, 0x69, 0x0f, 0xca, 0xdb, + 0x20, 0x9d, 0x8f, 0x24, 0xc9, 0x97, 0x2b, 0x1e, 0xf5, 0xaf, + 0x7a, 0x22, 0xa6, 0xa7, 0x1e, 0x76, 0xf4, 0x95, 0x7e, 0x7a, + 0x3d, 0x97, 0x3e, 0xda, 0xbd, 0x97, 0x3e, 0xfa, 0x42, 0x68, + 0xc1, 0x05, 0x78, 
0xe2, 0x57, 0x31, 0xcd, 0xe2, 0x53, 0x25, + 0x2a, 0x66, 0x68, 0x7d, 0x3d, 0x09, 0xcf, 0x8a, 0x29, 0xb7, + 0xe3, 0x4c, 0x0b, 0x77, 0x09, 0x01, 0x09, 0x77, 0x49, 0x01, + 0xf6, 0x88, 0xb6, 0xfe, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xac, 0x21, 0x34, + 0xf3, 0xbb, 0x7f, 0x8f, 0x29, 0x3a, 0x0f, 0x0a, 0xb4, 0xb8, + 0x5d, 0x00, 0x89, 0x59, 0xd7, 0x43, 0x0b, 0x3a, 0x64, 0x19, + 0x09, 0x3a, 0x65, 0x08, 0xf6, 0xc5, 0x9a, 0xf7, 0xb2, 0x0d, + 0xb7, 0xbd, 0xbc, 0x0d, 0x50, 0xca, 0xd4, 0x1d, 0x93, 0x44, + 0xc0, 0xa0, 0x14, 0x0f, 0xfe, 0x7a, 0x3c, 0x1b, 0x37, 0xba, + 0x61, 0x77, 0x37, 0xba, 0x70, 0x7f, 0xc8, 0x45, 0x8f, 0x80, + 0x2e, 0x33, 0xd8, 0x88, 0x61, 0xa4, 0xce, 0x30, 0xe1, 0xea, + 0x8c, 0x6f, 0xf2, 0x1f, 0x2d, 0xaa, 0xb0, 0x7b, 0x9c, 0x43, + 0xd1, 0xff, 0xed, 0x17, 0xd0, 0xff, 0xad, 0x13, 0x2f, 0x00, + 0x52, 0xec, 0x9c, 0xda, 0x41, 0x4d, 0xaa, 0x56, 0xab, 0x95, + 0x5e, 0x54, 0xb2, 0xae, 0x30, 0xcd, 0x2c, 0x5b, 0x12, 0x29, + 0xe0, 0xe4, 0x02, 0x41, 0xbb, 0x65, 0x02, 0x49, 0xb9, 0x64, + 0xfd, 0xb6, 0x46, 0x9b, 0x35, 0xf0, 0xb1, 0x56, 0xa9, 0x8e, + 0x45, 0xa5, 0xe7, 0x9e, 0xf1, 0xc1, 0x65, 0xa3, 0x38, 0x96, + 0x73, 0x29, 0xa8, 0x51, 0x54, 0x77, 0x31, 0x4a, 0x65, 0x77, + 0xb1, 0x43, 0x9a, 0x88, 0x4e, 0xbc, 0x32, 0x24, 0x99, 0x19, + 0xd4, 0x08, 0x13, 0x2f, 0xbf, 0x5b, 0x34, 0xfb, 0x2f, 0x93, + 0xd1, 0x9b, 0xdc, 0x70, 0x6e, 0x19, 0x56, 0x53, 0x44, 0xf3, + 0x5e, 0x53, 0x44, 0x7b, 0xa1, 0xac, 0xbb, 0x84, 0x08, 0xd9, + 0x8e, 0x10, 0xb1, 0x88, 0x4e, 0x33, 0x86, 0xdd, 0xc1, 0xbc, + 0x0a, 0x83, 0x84, 0x68, 0x75, 0x6f, 0x67, 0x77, 0x44, 0x85, + 0x07, 0x76, 0x44, 0x8f, 0x07, 0x77, 0xbb, 0x70, 0xf8, 0x88, + 0x83, 0x18, 0x11, 0xdb, 0xf2, 0xea, 0x5e, 0x78, 0x9d, 0xb4, + 0xa7, 0xd0, 0x77, 0x32, 0xfd, 0xb2, 0x90, 0xad, 0xb7, 0xdf, + 0x30, 0x02, 0xff, 0xd0, 0x20, 0x00, 0xff, 0xd0, 0xcf, 0xff, + 0x00, 0x2f, 0x00, 0x00, 0x00, 
0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x80, 0xde, 0x9b, 0x24, 0x79, 0xde, + 0x52, 0x83, 0xf4, 0x1f, 0x16, 0x0a, 0xa5, 0xf0, 0x7a, 0xfc, + 0xd9, 0x0d, 0x5f, 0x0d, 0xf9, 0x9a, 0x33, 0x4b, 0xf9, 0x9a, + 0x33, 0x09, 0x06, 0x65, 0xcc, 0xf6, 0x35, 0x08, 0x23, 0xaa, + 0x06, 0xb1, 0x5b, 0x50, 0xdf, 0xa4, 0xa3, 0xcd, 0xc0, 0x08, + 0xe6, 0x85, 0xcd, 0xb1, 0x2d, 0x1a, 0xdf, 0x32, 0xe6, 0x11, + 0xcf, 0x30, 0xa7, 0x11, 0x30, 0xcf, 0x58, 0xee, 0x4c, 0xad, + 0x9a, 0x46, 0xe1, 0x56, 0x69, 0x88, 0x9a, 0xbe, 0xe3, 0x93, + 0xa5, 0x8d, 0x59, 0x33, 0x0a, 0x86, 0x13, 0x46, 0x08, 0xa6, + 0x93, 0x86, 0x08, 0xa6, 0x93, 0x06, 0xf7, 0x59, 0x6c, 0xf9, + 0xc0, 0x96, 0x57, 0x1e, 0xef, 0xc6, 0x33, 0x25, 0x5d, 0x30, + 0x18, 0x77, 0x04, 0x7f, 0x63, 0x76, 0xc3, 0x81, 0x5d, 0x55, + 0xdf, 0xe5, 0xda, 0x74, 0xd7, 0xe5, 0xda, 0x74, 0x28, 0x1a, + 0x25, 0x8b, 0x58, 0xa9, 0xfa, 0x1c, 0xd9, 0xb2, 0xaa, 0xa8, + 0x52, 0x74, 0xba, 0x35, 0x97, 0xc3, 0x11, 0x61, 0x0d, 0x89, + 0x5b, 0x70, 0x11, 0xed, 0x57, 0x73, 0x15, 0xed, 0x53, 0x71, + 0xea, 0x12, 0xac, 0x8e, 0xaf, 0x5b, 0xae, 0xc5, 0x6d, 0x5b, + 0x68, 0xc1, 0x77, 0x6d, 0xba, 0x91, 0xfb, 0xd0, 0x12, 0x23, + 0xbe, 0x02, 0x12, 0xfc, 0x2a, 0xb2, 0x88, 0xfc, 0xaa, 0x92, + 0x98, 0xfc, 0x55, 0x6d, 0x67, 0x03, 0xaf, 0x1b, 0xbf, 0x3e, + 0xe4, 0x3c, 0x17, 0x8b, 0x4d, 0x1f, 0x23, 0xa6, 0x59, 0x00, + 0x3b, 0x4e, 0x54, 0x67, 0xe3, 0xc9, 0x5d, 0x46, 0x01, 0x4b, + 0x5d, 0x44, 0x21, 0x4b, 0xa2, 0xbb, 0xde, 0xb4, 0xcf, 0x48, + 0x2a, 0x76, 0x2d, 0xb7, 0x31, 0x7f, 0xc7, 0x35, 0xb1, 0x88, + 0x94, 0x5f, 0xa7, 0xe9, 0xde, 0xb7, 0xbf, 0x5a, 0x16, 0x37, + 0xb7, 0xdd, 0x96, 0x37, 0xb7, 0xd9, 0x69, 0xc8, 0x48, 0x26, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x35, 0x0b, 0x81, 0x6e, 0x7b, 
0xa4, 0x51, 0x57, + 0x5a, 0xb1, 0xff, 0xd2, 0x53, 0x0a, 0xdc, 0xfc, 0x94, 0x89, + 0x24, 0x1a, 0x00, 0x9e, 0x78, 0x3b, 0x10, 0x9b, 0x38, 0x3b, + 0xef, 0x64, 0xc7, 0xc4, 0xa4, 0x4f, 0xee, 0x1a, 0x58, 0xb4, + 0x30, 0x49, 0x03, 0xc1, 0x50, 0xdc, 0xb1, 0xec, 0x5b, 0x94, + 0x83, 0x06, 0x8e, 0x2c, 0x43, 0x84, 0xd9, 0xac, 0x03, 0x84, + 0xda, 0xac, 0xfc, 0x7b, 0x25, 0x53, 0x0c, 0x64, 0x40, 0xb9, + 0x49, 0x48, 0x41, 0x1d, 0xca, 0x4a, 0x67, 0x57, 0x1c, 0x2e, + 0x2b, 0x2d, 0xb4, 0x15, 0x0b, 0xb5, 0x7c, 0x80, 0x96, 0x9d, + 0x7c, 0x80, 0x9a, 0x9d, 0x83, 0x7f, 0x65, 0x62, 0x57, 0x29, + 0x49, 0x40, 0xbf, 0x1a, 0xbc, 0xa9, 0xbb, 0xa4, 0xb0, 0xa9, + 0xe1, 0x39, 0x3b, 0xd6, 0xc7, 0x04, 0x4f, 0xc1, 0x41, 0x2e, + 0x2f, 0x85, 0x41, 0x2c, 0x2f, 0xc5, 0xbe, 0xd3, 0xd0, 0x3a, + 0xbd, 0x99, 0x11, 0x63, 0xd0, 0xbd, 0x9f, 0xf4, 0xda, 0x71, + 0xf1, 0xbd, 0xae, 0xc2, 0x1f, 0xf7, 0x4e, 0x12, 0x29, 0x54, + 0x20, 0x52, 0x79, 0xde, 0x28, 0x52, 0x79, 0xd6, 0xd7, 0xad, + 0x86, 0x29, 0x3d, 0xe0, 0xe2, 0x58, 0xe4, 0xb3, 0x40, 0x94, + 0x24, 0xe9, 0x10, 0xea, 0x70, 0x36, 0x37, 0x84, 0x37, 0xf6, + 0xd6, 0x32, 0x35, 0xfc, 0x0a, 0xba, 0x35, 0xfc, 0x1a, 0xba, + 0xca, 0x03, 0xe5, 0x45, 0x5c, 0xd7, 0x39, 0x71, 0x09, 0x3b, + 0x31, 0x93, 0x18, 0x99, 0x9b, 0x43, 0x33, 0x39, 0x62, 0x7e, + 0xd9, 0x02, 0x22, 0xed, 0xd9, 0x08, 0x2a, 0xfe, 0xd9, 0x08, + 0x22, 0xff, 0x26, 0xf7, 0xdd, 0x00, 0xf8, 0x80, 0xe6, 0x7a, + 0x12, 0xe5, 0x3b, 0x51, 0x04, 0xf9, 0xcc, 0x3f, 0xcb, 0x42, + 0xeb, 0x28, 0xae, 0xee, 0x02, 0xd5, 0x0d, 0x18, 0x27, 0x15, + 0x0f, 0x18, 0x26, 0x15, 0xf0, 0xe7, 0xd9, 0xea, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x4b, 0x1d, 0x31, 0xeb, 0x9e, 0xea, 0x2f, 0x38, 0x01, 0x30, + 0xbe, 0x26, 0x94, 0x22, 0x25, 0x73, 0x29, 0x04, 0xe7, 0x8a, + 0x85, 0x80, 0x77, 0xfe, 0x84, 0x80, 0x77, 0xfe, 0x7b, 0x7f, + 0x88, 0x01, 0x9d, 0x4e, 0xd9, 0xec, 0xa2, 0xd8, 0xd5, 
0xa9, + 0x8d, 0x42, 0x0f, 0x5e, 0x6f, 0xc6, 0xee, 0xfe, 0xca, 0x2c, + 0xd6, 0x5f, 0x66, 0xcd, 0xcf, 0xf2, 0x66, 0x8d, 0xcf, 0xde, + 0x99, 0x72, 0x30, 0x21, 0xc2, 0xfb, 0xea, 0x85, 0xb3, 0x92, + 0xf2, 0xf9, 0xa3, 0xc8, 0x45, 0xd3, 0x7d, 0x19, 0xf6, 0xcd, + 0xd4, 0xc3, 0x45, 0xe7, 0x54, 0xe3, 0x47, 0xdf, 0x54, 0xc3, + 0x47, 0xdf, 0xab, 0x3c, 0xb8, 0x20, 0x21, 0x36, 0xfa, 0xa6, + 0x35, 0x15, 0x0e, 0xf0, 0x28, 0x44, 0xba, 0x93, 0x70, 0xc8, + 0x9e, 0xaf, 0x3c, 0x25, 0x31, 0x0c, 0xdc, 0x55, 0xf0, 0xee, + 0x1c, 0x45, 0xf0, 0xee, 0xe3, 0xba, 0x0f, 0x11, 0xe0, 0x28, + 0xf4, 0x7e, 0xe5, 0x04, 0xa8, 0xe5, 0x5e, 0x48, 0x1b, 0xc0, + 0x8e, 0x17, 0x05, 0xa3, 0x2d, 0xd0, 0x3c, 0xf0, 0x6d, 0x89, + 0xb4, 0xe0, 0x6d, 0x88, 0xb4, 0xe0, 0x92, 0x77, 0x4b, 0x1f, + 0xc2, 0x0f, 0x82, 0x73, 0xb7, 0x27, 0x88, 0x2e, 0x27, 0x3e, + 0x89, 0xfb, 0x61, 0x37, 0xdd, 0x66, 0xb7, 0x19, 0x1e, 0x36, + 0x03, 0x59, 0x1c, 0xee, 0x23, 0x19, 0x1c, 0xa6, 0xdc, 0xe6, + 0xe3, 0x59, 0xa0, 0x0c, 0x00, 0xd8, 0x55, 0xd9, 0x56, 0x47, + 0x6e, 0x8b, 0xc9, 0xb5, 0xd7, 0x39, 0xfb, 0x1a, 0x4d, 0xbc, + 0xe5, 0x65, 0x4c, 0x53, 0xc5, 0x60, 0x4c, 0x19, 0xe5, 0x60, + 0xb3, 0xe6, 0x1a, 0x9f, 0xf7, 0x25, 0x77, 0x24, 0x51, 0x25, + 0x46, 0x11, 0x38, 0x62, 0xc7, 0x6b, 0x52, 0x9a, 0x56, 0x9b, + 0x22, 0x56, 0xfa, 0xd4, 0xaa, 0xb2, 0x53, 0x0d, 0x2a, 0xb2, + 0x53, 0x0d, 0xd5, 0x4d, 0xac, 0xf2, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x64, + 0x3e, 0x18, 0x3b, 0xcb, 0xeb, 0xf2, 0x02, 0x44, 0x7b, 0x05, + 0x15, 0x20, 0x2e, 0x45, 0x8f, 0x10, 0x33, 0xb6, 0x83, 0xe0, + 0xb2, 0x9d, 0x83, 0xe0, 0xb2, 0x9d, 0x7c, 0x1f, 0x4d, 0x62, + 0x8e, 0x8a, 0x04, 0x60, 0x17, 0x58, 0xc2, 0x72, 0x0f, 0x5a, + 0x00, 0x4e, 0xcc, 0xc9, 0x4f, 0xbf, 0xda, 0x59, 0x02, 0x39, + 0x4b, 0xd5, 0x76, 0x29, 0x4b, 0xd1, 0x56, 0x29, 0xb4, 0x2e, + 0xa9, 0xd6, 0x9f, 0x51, 0x50, 0xd6, 0xf7, 0x24, 0x06, 0xa5, + 0x0f, 
0x2a, 0x25, 0xe3, 0x7b, 0xf1, 0x23, 0xb6, 0x2e, 0xa1, + 0xf2, 0xeb, 0xbe, 0xed, 0x60, 0xcf, 0xbe, 0xed, 0x60, 0x8f, + 0x41, 0x12, 0x9f, 0x70, 0x4a, 0xb0, 0x50, 0x89, 0xe3, 0xe3, + 0xd0, 0x22, 0x7e, 0x8f, 0x69, 0x3e, 0x35, 0x72, 0x52, 0xf7, + 0x35, 0x03, 0xed, 0xbf, 0xb5, 0x02, 0xcc, 0xbf, 0x35, 0x02, + 0xcc, 0xbf, 0xca, 0xfd, 0x33, 0x40, 0xb2, 0x0d, 0x23, 0xcc, + 0xf6, 0xcf, 0x89, 0xb3, 0x07, 0x63, 0x92, 0xad, 0x47, 0x3d, + 0x7d, 0xd3, 0xb3, 0x7e, 0xf0, 0x2c, 0xa7, 0x93, 0xe7, 0x98, + 0xa7, 0xbf, 0xf4, 0x89, 0x58, 0x40, 0x0b, 0x76, 0xda, 0x21, + 0x50, 0xb3, 0x08, 0x36, 0xcb, 0x8d, 0xd7, 0x74, 0xec, 0xaf, + 0x56, 0x85, 0x2c, 0xf6, 0x28, 0xab, 0xbf, 0xf8, 0xf4, 0x09, + 0x8b, 0xe2, 0xf4, 0x09, 0xaf, 0xe2, 0x0b, 0xf6, 0x50, 0x1d, + 0x56, 0x3e, 0xd4, 0x03, 0x57, 0x23, 0xf4, 0xa3, 0x10, 0xb0, + 0x8d, 0x06, 0x19, 0x69, 0x35, 0x6f, 0xfa, 0xc8, 0x37, 0x69, + 0x3f, 0x5d, 0x35, 0x6c, 0x3f, 0x88, 0x35, 0x6d, 0xc0, 0x77, + 0xca, 0x92, 0xc8, 0x3f, 0xce, 0x0c, 0xba, 0x1f, 0x44, 0xb7, + 0x68, 0xfb, 0xb8, 0x4a, 0xfa, 0x9a, 0x94, 0x72, 0x98, 0x40, + 0x89, 0xe2, 0x4f, 0x55, 0x98, 0xc2, 0xcf, 0x53, 0x98, 0xc2, + 0x30, 0xac, 0x67, 0x3d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x43, 0x91, 0x3d, + 0xe4, 0x75, 0xe0, 0xb3, 0x42, 0x9e, 0x5c, 0x75, 0x28, 0x41, + 0x25, 0xfc, 0x3e, 0xa4, 0xc7, 0x67, 0x23, 0xc8, 0x44, 0x67, + 0x23, 0xc8, 0x44, 0x67, 0xdc, 0x37, 0xbb, 0x98, 0x18, 0xb7, + 0x3f, 0x2f, 0x0c, 0x34, 0x2e, 0xc8, 0x9f, 0x42, 0xa1, 0xc0, + 0x5b, 0x8a, 0xb0, 0x48, 0xc0, 0xbe, 0x45, 0x80, 0x4c, 0x3e, + 0xa0, 0x44, 0x4c, 0x3e, 0xa0, 0x40, 0xb3, 0xc1, 0x5f, 0xbf, + 0xb8, 0x25, 0xf4, 0xa7, 0x68, 0xb3, 0x3f, 0x24, 0x90, 0xf4, + 0xf8, 0x21, 0x7f, 0x86, 0x5d, 0xa6, 0xb4, 0xc9, 0x2b, 0x64, + 0xb9, 0xd0, 0xed, 0xac, 0xb9, 0xd0, 0xe9, 0x2c, 0x46, 0x2f, + 0x16, 0xd3, 0xd3, 0x0d, 0xa5, 0xf5, 0xb9, 0x05, 0xea, 0x11, + 0xfb, 0x92, 0xf6, 
0x73, 0xc8, 0xea, 0xcd, 0xe8, 0x0e, 0xe0, + 0x65, 0x8f, 0xfc, 0x61, 0x14, 0xcc, 0xfc, 0x61, 0x54, 0xcc, + 0x03, 0x9e, 0xab, 0x33, 0xda, 0x88, 0xe9, 0xf2, 0xfd, 0xc8, + 0x19, 0x34, 0xde, 0x55, 0x52, 0x7b, 0xe8, 0xbe, 0xd3, 0xf8, + 0xa4, 0x55, 0x24, 0xc4, 0xe9, 0x85, 0xe7, 0xf9, 0xe9, 0x05, + 0x67, 0xfd, 0x16, 0xfa, 0x98, 0x02, 0xdd, 0xa3, 0x23, 0xec, + 0xd5, 0xf7, 0x75, 0x13, 0x5b, 0x8d, 0x54, 0x02, 0x6c, 0xc2, + 0x1e, 0x59, 0xc0, 0xfb, 0x5b, 0xc6, 0xf9, 0x5a, 0x77, 0xad, + 0xf8, 0xda, 0x7f, 0xe9, 0x07, 0x25, 0x80, 0x16, 0xaa, 0x2f, + 0xe7, 0xf3, 0x3b, 0x7d, 0x9d, 0x37, 0x97, 0x81, 0xa0, 0xa8, + 0x10, 0x43, 0xe4, 0x3a, 0x4f, 0xed, 0x38, 0xf2, 0x87, 0xe5, + 0x27, 0x36, 0x87, 0xe5, 0x25, 0x32, 0x78, 0x1a, 0xda, 0xcd, + 0x5d, 0x74, 0xe6, 0xf6, 0x2a, 0x7b, 0xd2, 0x57, 0x76, 0x48, + 0xe5, 0x26, 0x6b, 0x86, 0x70, 0x86, 0x56, 0xe3, 0x6a, 0xd7, + 0x12, 0x60, 0xce, 0x26, 0x16, 0x60, 0xea, 0x26, 0xe9, 0x9f, + 0x15, 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x39, 0xda, 0x11, 0x7f, 0x2c, 0x42, + 0xe4, 0x4b, 0xb8, 0xe0, 0x90, 0xd0, 0xbb, 0x18, 0x04, 0xc2, + 0x7b, 0x8f, 0x86, 0x4d, 0xa5, 0x8d, 0xa6, 0x50, 0xa9, 0x8d, + 0x86, 0x40, 0x56, 0x72, 0x79, 0xbf, 0xe5, 0x18, 0x44, 0x8b, + 0x1d, 0x18, 0xd4, 0x81, 0xa8, 0xed, 0xc3, 0x76, 0x96, 0x3b, + 0xd8, 0x31, 0x74, 0x15, 0x2c, 0x58, 0x46, 0xf5, 0x3e, 0x55, + 0x46, 0x55, 0x3c, 0x55, 0xb9, 0xaa, 0xc3, 0xaa, 0xcf, 0xa1, + 0xf7, 0x76, 0x8d, 0x9c, 0xef, 0x9a, 0xde, 0x2b, 0x7a, 0x49, + 0xc1, 0xc1, 0x38, 0xc8, 0x63, 0x45, 0x42, 0xeb, 0x67, 0x8d, + 0xb7, 0x79, 0x67, 0x85, 0x32, 0x79, 0x98, 0x7a, 0xcd, 0x86, + 0x05, 0xc0, 0xc6, 0xa4, 0xd0, 0xb0, 0x70, 0xd0, 0xed, 0x54, + 0xea, 0x94, 0xf4, 0xaa, 0x1d, 0x75, 0x91, 0x86, 0x3e, 0x6e, + 0x1c, 0xeb, 0x0e, 0x6e, 0x94, 0xeb, 0x2e, 0x6e, 0x6b, 0x14, + 0xd1, 0x91, 0x6a, 0xde, 0x6a, 0xcb, 0x82, 0x57, 0xf1, 0x95, + 0x49, 0xff, 0xe4, 0xf4, 0xd4, 
0x20, 0xcc, 0xbd, 0x2c, 0xa4, + 0x80, 0x77, 0x05, 0x0c, 0xdb, 0xbe, 0x05, 0x0c, 0xd9, 0x3e, + 0xfa, 0xf3, 0x26, 0xc1, 0x9f, 0x92, 0x00, 0x8b, 0x5c, 0xeb, + 0x6a, 0xd2, 0x90, 0xab, 0x56, 0x2a, 0xf7, 0xc0, 0x64, 0x11, + 0xc6, 0xbc, 0x97, 0x60, 0x90, 0xbc, 0x54, 0x40, 0x90, 0xbc, + 0x54, 0x40, 0x6f, 0x43, 0xab, 0xbf, 0x8a, 0xc4, 0xce, 0x3f, + 0xae, 0x5b, 0xc5, 0x96, 0x94, 0xa0, 0x67, 0xa2, 0xc7, 0x75, + 0x8d, 0x84, 0x30, 0xdb, 0xa1, 0x8b, 0x4e, 0xfb, 0xe1, 0xd8, + 0x4e, 0xfb, 0xa1, 0x88, 0xb1, 0x04, 0x5e, 0x77, 0x8a, 0x55, + 0xbf, 0xfb, 0x04, 0xfe, 0x21, 0x99, 0x89, 0xbe, 0x4a, 0x99, + 0xb5, 0x42, 0x06, 0x84, 0xbc, 0xe0, 0xf0, 0xfc, 0x85, 0x00, + 0x05, 0xc3, 0x85, 0x00, 0x04, 0xc3, 0x7a, 0xff, 0xfb, 0x3c, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0xda, 0x8d, 0x48, 0xe1, 0x99, 0x28, 0x49, + 0x90, 0x69, 0x60, 0xbe, 0xc5, 0x88, 0x99, 0x9a, 0x1f, 0x3a, + 0x78, 0xf0, 0xcb, 0x31, 0x68, 0xc3, 0xcb, 0x31, 0x68, 0xc1, + 0x34, 0xce, 0x97, 0x3e, 0x34, 0xff, 0xe7, 0x3a, 0xe6, 0xe1, + 0xff, 0xe7, 0xaf, 0x75, 0x3d, 0xb9, 0x2a, 0x06, 0x5b, 0xae, + 0x87, 0x59, 0x16, 0xd3, 0xc3, 0x4c, 0x37, 0x53, 0xc3, 0x4c, + 0x37, 0xd3, 0x3c, 0xb3, 0xc8, 0x2c, 0x08, 0xdd, 0x74, 0x06, + 0x33, 0xf9, 0x61, 0xcc, 0x7c, 0x79, 0xfd, 0x3a, 0x6d, 0x4a, + 0x40, 0xb5, 0xea, 0xf0, 0x78, 0x98, 0xec, 0xda, 0x3d, 0xaa, + 0xec, 0xda, 0x3d, 0xaa, 0x13, 0x25, 0xc2, 0x55, 0xf2, 0x87, + 0x14, 0x8b, 0xb6, 0x29, 0x44, 0xe1, 0xb1, 0x10, 0x9b, 0x13, + 0x2f, 0xae, 0xb6, 0x63, 0x8f, 0x8c, 0xce, 0xab, 0x4e, 0x0f, + 0x12, 0xab, 0x4f, 0x0c, 0x92, 0xab, 0xb0, 0xf3, 0x6d, 0x54, + 0x55, 0x95, 0x9b, 0x2a, 0xf5, 0x17, 0xd0, 0x79, 0x5c, 0xc5, + 0x3a, 0xbe, 0xae, 0xf6, 0xb2, 0x68, 0x0a, 0x20, 0xc8, 0xd6, + 0x08, 0xc2, 0xcb, 0x68, 0x0a, 0xc0, 0xcb, 0x68, 0xf5, 0x3f, + 0x34, 0x97, 0x4c, 0x52, 0x17, 0x9b, 0x91, 0xcb, 0x84, 0x19, + 0x7e, 0x61, 0x1a, 0x45, 0x0b, 0x1f, 0x6c, 
0x8a, 0xb4, 0x03, + 0xb9, 0x8f, 0x95, 0xa3, 0xba, 0x8d, 0x95, 0x23, 0xba, 0x8d, + 0x6a, 0xdc, 0x45, 0x72, 0x0d, 0xf0, 0x5b, 0x74, 0xe2, 0x5a, + 0x5b, 0xd0, 0x7b, 0x9b, 0x64, 0xe8, 0xb5, 0xbf, 0x72, 0x25, + 0xba, 0xd7, 0x2a, 0x2c, 0xbf, 0xa5, 0xc1, 0xfc, 0xbf, 0xa7, + 0x40, 0x7c, 0x40, 0x58, 0xbf, 0x83, 0x00, 0xbc, 0xc0, 0xc5, + 0xfc, 0xcc, 0x87, 0xf6, 0x82, 0xdb, 0x7f, 0x04, 0x38, 0xbe, + 0x87, 0xee, 0xae, 0x59, 0xc0, 0x5a, 0x3c, 0x5d, 0xc8, 0x56, + 0x3c, 0x5d, 0xc8, 0x5e, 0xc3, 0xa2, 0x37, 0xa1, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xc1, 0x7c, 0xa3, 0x86, 0x61, 0x37, 0x8d, 0xb0, 0xf0, 0xb9, + 0x6b, 0x2c, 0x31, 0xfa, 0xdf, 0xe6, 0x75, 0xbd, 0x0e, 0x31, + 0xb9, 0xb9, 0x9b, 0x20, 0xf1, 0xb9, 0x9f, 0x20, 0x0e, 0x46, + 0x60, 0xdf, 0xd2, 0x5e, 0xd3, 0xa5, 0xd5, 0xf2, 0xc8, 0xef, + 0xb3, 0x18, 0x09, 0x91, 0xaf, 0x4b, 0x40, 0x3b, 0x7a, 0xd1, + 0x73, 0xfb, 0x6e, 0x19, 0xf5, 0xe1, 0x7e, 0x19, 0xf5, 0xe1, + 0x81, 0xe6, 0x0a, 0x1e, 0x2a, 0x1e, 0x4e, 0x81, 0x9c, 0xf4, + 0x27, 0x36, 0x19, 0xba, 0xea, 0x28, 0x92, 0x1e, 0x34, 0x6b, + 0xfb, 0x5b, 0x58, 0x2e, 0x62, 0xaa, 0x3a, 0x34, 0x62, 0xaa, + 0x38, 0x34, 0x9d, 0x55, 0xc7, 0xcb, 0x89, 0x87, 0xa7, 0x3b, + 0xd6, 0x28, 0x54, 0xee, 0x3d, 0x26, 0x3d, 0x67, 0x47, 0x88, + 0xe6, 0x9a, 0xf4, 0x5a, 0x1a, 0x10, 0x27, 0xc0, 0x16, 0x9a, + 0x27, 0x48, 0x16, 0x9a, 0xd8, 0xb7, 0xe9, 0x65, 0x82, 0x75, + 0x6c, 0xad, 0xd8, 0xfe, 0x32, 0x42, 0x39, 0x26, 0x17, 0x9b, + 0x35, 0xf4, 0x6d, 0x26, 0x45, 0x9f, 0x6b, 0x5c, 0x50, 0x1d, + 0x22, 0x55, 0x50, 0x1d, 0x62, 0x45, 0xaf, 0xe2, 0x9d, 0xba, + 0x1d, 0x4e, 0xf8, 0xcb, 0x37, 0x49, 0x44, 0xcd, 0x29, 0xe3, + 0xf8, 0xfd, 0xa3, 0x23, 0x64, 0x27, 0x05, 0x49, 0xee, 0x3e, + 0xa5, 0x26, 0xec, 0x5f, 0xa5, 0x26, 0xec, 0x5f, 0x5a, 0xd9, + 0x13, 0xa0, 0xcb, 0xf5, 0x5a, 0xd2, 0x63, 0xec, 0xfb, 0xba, + 0x79, 0x40, 0x19, 0x76, 0xcc, 0xed, 0x49, 0x94, 0x2f, 
0x43, + 0x01, 0x61, 0xa5, 0x45, 0x27, 0x76, 0xa5, 0x45, 0x07, 0x72, + 0x5a, 0xba, 0xf8, 0x8d, 0xdc, 0x78, 0x48, 0xb6, 0x79, 0xf7, + 0x3f, 0x87, 0x1c, 0xb1, 0x2c, 0xc6, 0xef, 0x6c, 0x62, 0x37, + 0x05, 0x08, 0xe7, 0xcc, 0xe5, 0x08, 0xae, 0x81, 0xc5, 0x08, + 0xae, 0x84, 0x3a, 0xf7, 0x51, 0x7b, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6a, 0x83, + 0xce, 0x4a, 0xa5, 0x2a, 0xf8, 0xbb, 0x6d, 0x45, 0x91, 0x4d, + 0x62, 0x8a, 0xd2, 0xb0, 0x02, 0xbc, 0x85, 0xc4, 0x53, 0x0a, + 0x16, 0x59, 0x53, 0x08, 0x96, 0x59, 0xac, 0xf7, 0x69, 0xa6, + 0xe9, 0x7e, 0xd1, 0x6c, 0x31, 0x53, 0x3f, 0xf9, 0xdd, 0x63, + 0x8c, 0x31, 0x2f, 0xb4, 0xaf, 0xe2, 0x85, 0x43, 0x81, 0x3e, + 0xd5, 0x66, 0x29, 0x6a, 0xd5, 0x66, 0x29, 0x6a, 0x2a, 0x99, + 0xd6, 0x95, 0x01, 0xea, 0x7a, 0x41, 0x90, 0x79, 0xdd, 0x28, + 0x88, 0x12, 0x58, 0x28, 0x7f, 0x3c, 0x30, 0x76, 0xdd, 0x22, + 0x71, 0x46, 0x5a, 0x22, 0xb3, 0x74, 0x5e, 0x22, 0x31, 0x64, + 0xa1, 0xdd, 0xce, 0x9b, 0xc9, 0x34, 0x49, 0xe9, 0x23, 0x17, + 0xa7, 0xe5, 0xe2, 0x03, 0x60, 0xb3, 0x65, 0x24, 0x24, 0xf8, + 0x79, 0xeb, 0xa9, 0xf0, 0x08, 0xe3, 0x05, 0xfa, 0x48, 0xe3, + 0x05, 0xf8, 0xb7, 0x1c, 0xfa, 0x07, 0xb7, 0x5f, 0x5f, 0x26, + 0x52, 0xde, 0x36, 0x18, 0x87, 0x3d, 0x4d, 0xc4, 0x84, 0x61, + 0x2e, 0x5b, 0xfa, 0x02, 0x43, 0xae, 0x1e, 0xa7, 0x0a, 0xa6, + 0x9e, 0xa7, 0x0a, 0xa4, 0x61, 0x58, 0xf5, 0x5b, 0xce, 0x40, + 0xf0, 0xe3, 0x30, 0xb1, 0x0d, 0xca, 0xdc, 0x63, 0x69, 0x1a, + 0xf1, 0x3c, 0x88, 0x5a, 0xd1, 0xd1, 0xde, 0x33, 0xa0, 0x71, + 0xac, 0x82, 0xb0, 0x71, 0x8c, 0x82, 0x4f, 0x8e, 0x73, 0x7d, + 0x15, 0x7e, 0x14, 0x61, 0x32, 0x62, 0x14, 0x08, 0x5b, 0x91, + 0x64, 0x3b, 0xdc, 0x7c, 0x64, 0x6c, 0x09, 0x92, 0xf0, 0xda, + 0x2d, 0x56, 0x64, 0xfc, 0x0d, 0x56, 0x66, 0xfc, 0xf2, 0xa9, + 0x9b, 0x03, 0x1e, 0x93, 0xf3, 0x4c, 0xd4, 0x2c, 0x98, 0xa2, + 0x8f, 0xfd, 0x9a, 0xd3, 0xc8, 0xe7, 0x9f, 0x9e, 0xb1, 0x14, + 0x6b, 
0x98, 0x21, 0x14, 0x0b, 0x08, 0x31, 0x14, 0x0b, 0x08, + 0xce, 0xeb, 0xf4, 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb2, 0x73, 0xc5, 0x06, + 0xe6, 0xce, 0xd2, 0xe7, 0xd5, 0x7d, 0xf1, 0x55, 0xcc, 0xd9, + 0xf2, 0x91, 0xeb, 0x6e, 0x04, 0x02, 0x80, 0x64, 0xf3, 0x52, + 0xc0, 0x64, 0xf3, 0x12, 0x3f, 0x9b, 0x0c, 0xed, 0x03, 0x29, + 0xfb, 0x5d, 0xa6, 0x30, 0xd7, 0x96, 0xb2, 0xa5, 0xfa, 0x9d, + 0x80, 0x2c, 0xef, 0xed, 0xee, 0x93, 0x9c, 0x89, 0xa6, 0xf7, + 0x95, 0x9b, 0xa6, 0xb7, 0x95, 0x8b, 0x59, 0x48, 0x6a, 0x74, + 0x89, 0x28, 0x26, 0xa3, 0x73, 0xd3, 0xf4, 0xd6, 0x55, 0xce, + 0x4b, 0xcb, 0x6a, 0x2e, 0x64, 0x64, 0x16, 0x4e, 0xb1, 0xd0, + 0xb6, 0xed, 0xba, 0xdc, 0xb6, 0xee, 0xba, 0xdc, 0x49, 0x11, + 0x45, 0x23, 0x4c, 0x0c, 0x8e, 0x70, 0x70, 0x4a, 0x53, 0x5e, + 0x52, 0xeb, 0x38, 0xf2, 0xb9, 0x4f, 0x91, 0xee, 0x4a, 0xff, + 0x4a, 0xa8, 0x8c, 0xee, 0x80, 0xea, 0x0c, 0xee, 0x80, 0xea, + 0xf3, 0x11, 0x7f, 0x15, 0xbd, 0x66, 0xe5, 0x4a, 0xf7, 0x6d, + 0xa2, 0xbe, 0xfe, 0x87, 0xc1, 0xc9, 0x24, 0xa4, 0x7e, 0xb3, + 0x68, 0x06, 0xda, 0xad, 0xa0, 0x17, 0x52, 0xa9, 0xa4, 0x06, + 0x52, 0xa9, 0x5b, 0xf9, 0xad, 0x56, 0xad, 0xe7, 0x3e, 0x5e, + 0x0b, 0x9c, 0xcf, 0x74, 0xb4, 0x32, 0x8e, 0x88, 0xb6, 0x76, + 0x89, 0xf5, 0x85, 0x7f, 0x0a, 0x0f, 0x14, 0xb2, 0x29, 0x5f, + 0x14, 0xb6, 0x29, 0x5f, 0xeb, 0x49, 0xd6, 0xa0, 0x1f, 0x48, + 0x4b, 0xac, 0x49, 0x5b, 0xcb, 0x12, 0x6d, 0xc2, 0xda, 0x96, + 0x7c, 0xda, 0xfb, 0xbe, 0xd6, 0xf8, 0xaa, 0xfe, 0x53, 0x7a, + 0xe2, 0xfe, 0x5b, 0xfa, 0xea, 0xfe, 0xa4, 0x05, 0x15, 0x01, + 0x9a, 0xc6, 0xf6, 0x49, 0x9c, 0x36, 0x7c, 0xe7, 0xc6, 0x33, + 0x73, 0xd3, 0xf4, 0x03, 0xe3, 0x48, 0x56, 0x5d, 0x55, 0x1f, + 0x54, 0x13, 0x8f, 0xbe, 0x54, 0x13, 0x87, 0x9e, 0xab, 0xec, + 0x78, 0x61, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x40, 0x6a, 0xf8, 0x06, 0xc9, 0xa9, + 0xdd, 0x4b, 0x2f, 0x50, 0x9c, 0x1a, 0xeb, 0xb6, 0x24, 0xa9, + 0x73, 0xa2, 0xb0, 0x68, 0xbd, 0x82, 0xbc, 0x2c, 0xbf, 0x82, + 0xbc, 0x2c, 0x40, 0x7d, 0x43, 0xd3, 0xac, 0xfb, 0x7d, 0x7c, + 0x66, 0x6e, 0xe5, 0x0c, 0x97, 0xc8, 0x05, 0x2b, 0xcd, 0x02, + 0xdf, 0xf0, 0xcc, 0x43, 0x82, 0xbc, 0x89, 0x4a, 0xe7, 0xb1, + 0x8d, 0x4a, 0xc7, 0xb1, 0x72, 0xb5, 0x38, 0x4e, 0x08, 0x40, + 0x79, 0x74, 0x29, 0x9f, 0xbc, 0xb2, 0x4f, 0x81, 0x63, 0x49, + 0x1f, 0xb6, 0x70, 0x5b, 0x88, 0xe2, 0x47, 0x8e, 0xe8, 0xe0, + 0x04, 0xa2, 0xec, 0xe0, 0x04, 0xa2, 0x13, 0x1f, 0xfb, 0x5d, + 0x5e, 0xe2, 0x0d, 0x6e, 0x8f, 0x54, 0x00, 0x39, 0x79, 0x88, + 0x5f, 0x96, 0xa3, 0x30, 0xaa, 0x8b, 0x11, 0x56, 0xba, 0xf5, + 0x20, 0xe6, 0x3a, 0x4e, 0x20, 0xf6, 0xba, 0x4f, 0xdf, 0x09, + 0x45, 0xb0, 0x61, 0x3a, 0x89, 0x85, 0x99, 0x15, 0x41, 0x4b, + 0xea, 0x2f, 0xa1, 0x35, 0xba, 0x23, 0xe2, 0xd8, 0x0a, 0x43, + 0xa9, 0xc6, 0x68, 0x21, 0xa9, 0xd4, 0x6a, 0x23, 0xa9, 0xd0, + 0x95, 0xdc, 0x56, 0x2f, 0x22, 0x82, 0x8d, 0xa1, 0x5f, 0xd8, + 0xc0, 0x9f, 0x61, 0x63, 0x1b, 0xe4, 0x7c, 0x40, 0xa1, 0x32, + 0xbf, 0x0f, 0x79, 0x56, 0x3f, 0x4e, 0x19, 0x22, 0x3f, 0x4e, + 0x19, 0x26, 0xc0, 0xb1, 0xe6, 0xd9, 0x73, 0x1c, 0x55, 0xa0, + 0x61, 0xdd, 0xe3, 0x4b, 0x97, 0xb1, 0xe1, 0x31, 0x26, 0xd5, + 0xfb, 0xaa, 0x86, 0x65, 0x41, 0xab, 0x82, 0xcc, 0xc2, 0x3b, + 0x82, 0xcc, 0x83, 0x3b, 0x7d, 0x33, 0x3c, 0xc4, 0x81, 0xa5, + 0xf5, 0xe1, 0x9a, 0x9d, 0xba, 0x62, 0x10, 0xf7, 0xcd, 0x1f, + 0x63, 0x15, 0x35, 0x79, 0x23, 0x09, 0xf8, 0x18, 0x69, 0x59, + 0xfc, 0x58, 0x63, 0x59, 0xfc, 0x58, 0x9c, 0xa6, 0x03, 0xa7, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x3b, 0x3b, 0xd4, 0x29, 0xf6, 0x00, 0x55, + 0x52, 0xb9, 0x1d, 0x70, 0x46, 0x8d, 0x66, 0xdd, 0x55, 0xab, + 0xa6, 0x7f, 0x50, 0xb1, 0xee, 
0xff, 0x10, 0xa1, 0xee, 0xff, + 0xef, 0x5e, 0x11, 0x00, 0x76, 0x2a, 0x8b, 0x86, 0x12, 0x2c, + 0xb8, 0x77, 0x7a, 0xc5, 0x6d, 0x83, 0x79, 0x58, 0x39, 0x84, + 0x86, 0xc4, 0x99, 0x4e, 0x3c, 0xb4, 0xd4, 0x40, 0x3c, 0xb4, + 0x94, 0x40, 0xc3, 0x4b, 0x6b, 0xbf, 0x01, 0xe4, 0x14, 0x40, + 0x64, 0xa1, 0x03, 0x58, 0xa7, 0x78, 0x65, 0x9b, 0x20, 0xd0, + 0x0e, 0x64, 0xd5, 0xdf, 0x5f, 0x88, 0x65, 0xfe, 0x6b, 0xa5, + 0x65, 0xfe, 0x6f, 0xa5, 0x9a, 0x01, 0x90, 0x5a, 0x44, 0x37, + 0xeb, 0xcf, 0xf0, 0x23, 0x0a, 0x0c, 0x55, 0xce, 0x91, 0x22, + 0xf7, 0xdd, 0xc8, 0x4e, 0xa7, 0xfa, 0x08, 0xaf, 0x9f, 0xd5, + 0x48, 0xd0, 0xb7, 0xdd, 0x48, 0xdd, 0x48, 0x22, 0xb7, 0x22, + 0xf6, 0x4d, 0x6d, 0x15, 0x05, 0xdb, 0x34, 0xb2, 0xa6, 0x73, + 0xa1, 0x01, 0x0c, 0x6e, 0x35, 0xd6, 0x58, 0x18, 0x80, 0x9c, + 0x4a, 0x08, 0x08, 0xb4, 0x48, 0x08, 0x08, 0xb4, 0xb7, 0xf7, + 0xf7, 0x4b, 0xd8, 0x1a, 0xc3, 0x7b, 0xd2, 0x74, 0x64, 0x21, + 0x0b, 0xc6, 0xd6, 0xb0, 0x68, 0x2a, 0xf3, 0xab, 0x74, 0x17, + 0x59, 0x69, 0x7c, 0x12, 0x88, 0xeb, 0x7c, 0x12, 0x98, 0xeb, + 0x83, 0xed, 0x67, 0x14, 0xd8, 0x4e, 0xad, 0x85, 0x03, 0xda, + 0x33, 0x63, 0xf3, 0x94, 0xdb, 0x23, 0xe1, 0x5d, 0x5f, 0x21, + 0x7b, 0xfd, 0x27, 0x1d, 0xec, 0x95, 0x1f, 0x53, 0xf9, 0x95, + 0x1f, 0x43, 0x06, 0x6a, 0xe0, 0xbc, 0x41, 0x9e, 0x3d, 0x77, + 0xc5, 0x9e, 0x5d, 0xba, 0x2b, 0x53, 0x0d, 0xae, 0xda, 0x92, + 0x81, 0x9c, 0x8a, 0xd6, 0x4b, 0xcb, 0xc6, 0x99, 0x46, 0xac, + 0xc6, 0x9d, 0x47, 0xac, 0x39, 0x62, 0xb8, 0x53, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x82, 0x87, 0x45, 0x45, 0x86, 0xdc, 0xb9, 0xe2, 0xca, 0x9b, + 0x84, 0xbb, 0x09, 0x78, 0x1b, 0xcf, 0x21, 0x2d, 0x2f, 0x53, + 0x85, 0xbe, 0x3e, 0xfb, 0x81, 0xbd, 0x3e, 0xfb, 0x7e, 0x42, + 0xc1, 0x04, 0xbb, 0x16, 0xd2, 0x57, 0x65, 0xb9, 0xde, 0x74, + 0x57, 0x17, 0xea, 0x24, 0x58, 0xf8, 0xb5, 0x62, 0x82, 0x5b, + 0x05, 0xbd, 0x97, 0x47, 0x64, 0x21, 0x93, 
0x53, 0xe0, 0x21, + 0x6c, 0xac, 0x1f, 0xde, 0xee, 0x00, 0x50, 0x14, 0x63, 0xc9, + 0x80, 0xb5, 0xb0, 0x6c, 0x69, 0xfb, 0xcb, 0x54, 0xb3, 0x52, + 0x28, 0xa1, 0xde, 0x66, 0x32, 0x71, 0xa0, 0x52, 0x22, 0x71, + 0xa0, 0x52, 0xdd, 0x8e, 0x5f, 0xad, 0xae, 0x05, 0xf3, 0x7e, + 0x6e, 0x99, 0x69, 0xfa, 0x86, 0x04, 0xe6, 0x7f, 0xe1, 0xd1, + 0xdc, 0x00, 0x97, 0xd8, 0x6d, 0x5f, 0xc5, 0x5f, 0x48, 0x5f, + 0x85, 0x5d, 0x48, 0x5f, 0x7a, 0xa2, 0xb7, 0xa0, 0xb9, 0x99, + 0xd9, 0x4d, 0xbe, 0x56, 0xd4, 0xca, 0x76, 0xad, 0xf8, 0x9e, + 0xc1, 0xdd, 0x99, 0x0f, 0x7d, 0xb7, 0x3b, 0xa6, 0x75, 0x3f, + 0x7e, 0xb7, 0x75, 0xbf, 0x7b, 0xb7, 0x8a, 0x40, 0x84, 0x48, + 0x6f, 0x69, 0x1f, 0x7b, 0x05, 0xa6, 0xe5, 0xe7, 0xff, 0x7d, + 0x1a, 0x65, 0x1b, 0x25, 0x26, 0xee, 0xce, 0xb0, 0x18, 0x5a, + 0xde, 0xb1, 0x39, 0xc5, 0xde, 0xb1, 0x1b, 0xcc, 0x21, 0x4e, + 0xe4, 0x33, 0xcd, 0xac, 0x03, 0xb1, 0xef, 0xa5, 0xb5, 0x90, + 0x7b, 0xfb, 0x12, 0xf4, 0x7b, 0x35, 0xf6, 0x33, 0x97, 0xb3, + 0x55, 0x29, 0x61, 0xb3, 0x55, 0x31, 0x61, 0xb1, 0x55, 0x31, + 0x9e, 0x4e, 0xaa, 0xce, 0xf5, 0x1b, 0xc8, 0xed, 0x32, 0x8b, + 0xa8, 0x53, 0x22, 0x96, 0x56, 0x3a, 0x40, 0x49, 0x7e, 0x5e, + 0xb5, 0x20, 0x0f, 0x18, 0x86, 0x50, 0x2a, 0x2a, 0x86, 0x50, + 0x2a, 0x3a, 0x79, 0xaf, 0xd5, 0xc5, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x9d, 0xe1, + 0x7a, 0x57, 0x02, 0xc3, 0x96, 0x40, 0xd8, 0x31, 0x97, 0x0e, + 0x76, 0x69, 0xd4, 0xa8, 0x12, 0x56, 0x5b, 0xad, 0x30, 0x74, + 0x1c, 0x8a, 0x30, 0x74, 0x1d, 0x88, 0xcf, 0x8b, 0xe2, 0x77, + 0x77, 0xa3, 0x56, 0x23, 0x21, 0x03, 0xc4, 0xa9, 0x27, 0xdd, + 0x3d, 0x7e, 0x29, 0xa5, 0xca, 0xc6, 0x84, 0x4a, 0xda, 0x5a, + 0xa4, 0xe4, 0xd6, 0xca, 0xa4, 0xe4, 0xd6, 0xca, 0x5b, 0x1b, + 0x29, 0x35, 0xd3, 0x77, 0x5a, 0x14, 0x6a, 0x59, 0xdc, 0xaf, + 0x98, 0xda, 0x97, 0x10, 0x8b, 0xa1, 0x8c, 0xf0, 0xe1, 0x38, + 0xaf, 0xa4, 0x81, 0xa7, 0xce, 0xe4, 0x81, 0xa6, 0x8e, 
0xe4, + 0x7e, 0x59, 0x71, 0x1b, 0x03, 0x43, 0x26, 0x3e, 0x29, 0xd6, + 0xff, 0x60, 0x8b, 0x94, 0x72, 0x6c, 0x6c, 0xbe, 0x62, 0xfc, + 0x51, 0xa6, 0x3d, 0x62, 0x48, 0xab, 0x2d, 0xfe, 0x48, 0xab, + 0x2d, 0xfe, 0xb7, 0x54, 0xd2, 0x01, 0x2d, 0xa6, 0xdd, 0xf0, + 0x2a, 0x9a, 0xe3, 0x81, 0x8e, 0xe1, 0xf4, 0x42, 0x21, 0x10, + 0x66, 0x6b, 0xdd, 0xb5, 0x8e, 0x4c, 0x0e, 0x51, 0xcf, 0x72, + 0x0c, 0x71, 0xce, 0x72, 0xf3, 0x8e, 0x31, 0x8d, 0x71, 0x03, + 0xba, 0xc5, 0xc6, 0x42, 0xfe, 0xd4, 0xcc, 0x04, 0xb4, 0xd3, + 0x12, 0x99, 0xc5, 0x99, 0x58, 0x45, 0x3f, 0xe8, 0x99, 0xd9, + 0x1e, 0x59, 0x99, 0xcd, 0x9e, 0xd9, 0x66, 0x32, 0x61, 0x26, + 0x55, 0x01, 0xf8, 0xf1, 0xe3, 0x4e, 0xf6, 0x6d, 0x4e, 0x27, + 0xdf, 0x75, 0xb9, 0x69, 0x06, 0x99, 0x70, 0x84, 0x4a, 0x6a, + 0x36, 0xa6, 0x43, 0x6c, 0x34, 0x84, 0x42, 0x6c, 0xcb, 0x7b, + 0xbd, 0x93, 0x9e, 0x8f, 0xcb, 0xd2, 0xf9, 0x3a, 0xe5, 0x16, + 0x75, 0x34, 0x7b, 0xd0, 0x46, 0x4e, 0xfc, 0x82, 0xea, 0xab, + 0x23, 0x01, 0x4a, 0x39, 0x06, 0x02, 0x4a, 0x39, 0x26, 0x02, + 0xb5, 0xc6, 0xd9, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x55, 0xfe, 0x07, 0xbc, + 0x1a, 0xd1, 0x1d, 0x62, 0xb7, 0xcb, 0xfb, 0xf2, 0x3a, 0xc4, + 0xa8, 0xb2, 0x90, 0x25, 0x12, 0x25, 0x9e, 0xb4, 0x92, 0x69, + 0x9e, 0xa5, 0x92, 0x69, 0x61, 0x5a, 0x6d, 0x96, 0x85, 0x2c, + 0xb7, 0x43, 0xbb, 0x58, 0x0f, 0xf6, 0x49, 0x10, 0x36, 0x28, + 0xc0, 0xde, 0xec, 0x29, 0x48, 0x46, 0xe7, 0x3f, 0x0c, 0x57, + 0xa6, 0x3b, 0x08, 0x57, 0xa6, 0x3b, 0xf7, 0xa8, 0x59, 0xc4, + 0x56, 0x88, 0x07, 0x77, 0x0d, 0x50, 0x41, 0x93, 0xa3, 0x6b, + 0x26, 0x85, 0x6d, 0xd8, 0xdd, 0x7b, 0x00, 0xd0, 0xa7, 0x19, + 0x8c, 0xa8, 0xc7, 0x89, 0x88, 0xa8, 0xc7, 0x89, 0x77, 0x57, + 0x38, 0x76, 0x7a, 0x99, 0xee, 0x29, 0x21, 0x04, 0x71, 0x8f, + 0x8a, 0x10, 0x3f, 0x6b, 0x7a, 0xe4, 0x8d, 0x74, 0x4f, 0x99, + 0xa1, 0x71, 0x37, 0xcc, 0x1d, 0x60, 0x37, 0xcc, 0x9d, 0x60, + 0xc8, 
0x33, 0x62, 0x9f, 0x8c, 0x6c, 0xed, 0xb5, 0xa1, 0xb0, + 0x6a, 0x75, 0xb2, 0x79, 0xac, 0xba, 0x4a, 0x50, 0x9c, 0x08, + 0x34, 0x95, 0x21, 0x0c, 0x70, 0x99, 0xed, 0x8b, 0x70, 0x99, + 0xed, 0x8b, 0x8f, 0x66, 0x12, 0x74, 0xd1, 0x4f, 0x29, 0xd6, + 0x91, 0x18, 0xfb, 0x98, 0xc8, 0xfa, 0x08, 0x4c, 0xd9, 0x5f, + 0x17, 0x7e, 0x2d, 0x81, 0x9b, 0x9c, 0x9c, 0xa3, 0x13, 0x3d, + 0x9c, 0xa3, 0x13, 0x3d, 0x63, 0x5c, 0xec, 0xc2, 0x7c, 0x6f, + 0x6c, 0x95, 0xa4, 0xcf, 0xec, 0x02, 0x94, 0xa6, 0x86, 0xa3, + 0x3f, 0x69, 0x96, 0x3d, 0x1a, 0xdd, 0xd0, 0xf1, 0x10, 0x84, + 0xce, 0x21, 0x10, 0xc5, 0xce, 0x21, 0xef, 0x3a, 0x31, 0xde, + 0x3f, 0xda, 0x87, 0x1f, 0x1e, 0xb6, 0x69, 0x34, 0x95, 0x1c, + 0xd3, 0x67, 0xd6, 0xa0, 0x25, 0x98, 0x26, 0x1a, 0x4c, 0xb2, + 0x02, 0x2a, 0x1f, 0xae, 0x06, 0x2a, 0x1d, 0xaa, 0xf9, 0xd5, + 0xe2, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x89, 0x46, 0x8b, 0x02, 0xaf, 0x78, + 0x3d, 0x9f, 0x52, 0xf3, 0xa6, 0x4a, 0xbd, 0xc3, 0x39, 0x7f, + 0xc6, 0x6c, 0xd2, 0x95, 0xc8, 0x66, 0xf6, 0xfd, 0xc8, 0x66, + 0xf6, 0xfd, 0x37, 0x99, 0x09, 0x02, 0xbe, 0x4b, 0xf3, 0xd7, + 0x43, 0x58, 0x80, 0xf3, 0x0a, 0xeb, 0xd9, 0x24, 0x0e, 0x40, + 0x2b, 0x68, 0x93, 0x12, 0x7f, 0xf6, 0x96, 0x19, 0x2f, 0xd5, + 0x96, 0x13, 0x2d, 0xf6, 0x69, 0xec, 0xd2, 0x09, 0xea, 0xb8, + 0x07, 0x8e, 0x3d, 0xea, 0x71, 0x6b, 0x94, 0x2e, 0xf6, 0x1c, + 0xaf, 0x90, 0xd5, 0xd7, 0x15, 0x78, 0x1c, 0x9d, 0xa5, 0x18, + 0x1e, 0xc9, 0xa5, 0x18, 0x1c, 0xcd, 0x5a, 0xe7, 0xe3, 0x32, + 0xaa, 0x55, 0x62, 0x05, 0x0d, 0x14, 0xa6, 0x2a, 0x55, 0xbb, + 0x99, 0x0e, 0x82, 0x86, 0xe7, 0x4a, 0xf7, 0x92, 0xe9, 0x4b, + 0xa7, 0x71, 0xa9, 0x4b, 0xb7, 0x90, 0xa9, 0x4b, 0x48, 0x6f, + 0x56, 0xb4, 0xe2, 0x3a, 0x02, 0x50, 0xb7, 0xed, 0x73, 0xf0, + 0xee, 0x20, 0x1f, 0xf7, 0x41, 0x9b, 0xbb, 0x16, 0x96, 0x81, + 0xe1, 0x5b, 0xe3, 0x82, 0x82, 0x5e, 0x43, 0x82, 0x83, 0x5e, + 0xbc, 0x7d, 0x7c, 
0xa1, 0xb3, 0x34, 0xda, 0x07, 0x6c, 0xdf, + 0xc0, 0x69, 0x43, 0x40, 0xf8, 0x39, 0x54, 0xd2, 0x27, 0xaf, + 0x11, 0x38, 0x3c, 0x06, 0xe0, 0x18, 0xb4, 0xae, 0xc0, 0x18, + 0x34, 0xae, 0x3f, 0xe7, 0xcb, 0x51, 0xe7, 0xca, 0x17, 0x92, + 0xd6, 0x51, 0x00, 0x27, 0xef, 0x54, 0x52, 0x97, 0xc3, 0xac, + 0x98, 0x5b, 0xf0, 0x1f, 0xb9, 0x88, 0xfd, 0x76, 0xd1, 0xa3, + 0xdd, 0x56, 0xd1, 0xa3, 0x22, 0xa9, 0x2e, 0x5c, 0xe6, 0x3b, + 0x8a, 0xeb, 0x1e, 0x1b, 0x31, 0xf4, 0xd1, 0x84, 0x2b, 0x09, + 0xbf, 0x71, 0xab, 0x4c, 0x86, 0x04, 0x64, 0x1b, 0x96, 0x30, + 0xe5, 0x28, 0x86, 0x30, 0xe1, 0x28, 0x79, 0xcf, 0x1e, 0xd7, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0xad, 0x70, 0x07, 0x15, 0x41, 0xe2, 0xda, 0xf8, + 0x5f, 0xfa, 0x29, 0x0b, 0x74, 0xd5, 0x70, 0x2e, 0x25, 0xac, + 0xc1, 0xbe, 0x43, 0x9b, 0xc0, 0x84, 0x47, 0x99, 0xc0, 0x84, + 0xb8, 0x66, 0x3f, 0x7b, 0x86, 0xe6, 0x84, 0x37, 0x13, 0xf4, + 0x73, 0x2b, 0x32, 0x37, 0xa9, 0x86, 0x87, 0x2f, 0x4e, 0xd1, + 0x53, 0x3d, 0xc6, 0x78, 0x53, 0xb7, 0x77, 0x7a, 0x53, 0x37, + 0x77, 0x78, 0xac, 0xc8, 0x88, 0x87, 0x2c, 0x1e, 0xe1, 0xfa, + 0xb6, 0xbb, 0x93, 0x32, 0x33, 0xe2, 0x1a, 0xe0, 0x14, 0x80, + 0xe5, 0xa4, 0x1d, 0x92, 0x2a, 0x11, 0x5c, 0xba, 0x49, 0x29, + 0x5c, 0xb2, 0x49, 0x29, 0xa3, 0x4d, 0xb6, 0xd6, 0x04, 0xd3, + 0x44, 0x0e, 0x03, 0x98, 0x01, 0x01, 0xec, 0xf0, 0xf4, 0xb8, + 0xc3, 0x81, 0x00, 0x66, 0xbc, 0x34, 0x46, 0xcf, 0x51, 0x01, + 0x17, 0x0f, 0xd1, 0x21, 0x16, 0x0f, 0x2e, 0xde, 0xe9, 0xf0, + 0x8e, 0x46, 0x50, 0x4f, 0x38, 0x65, 0x3c, 0xf8, 0x30, 0xc5, + 0x6d, 0x04, 0x18, 0x23, 0x95, 0xb4, 0xb8, 0x9d, 0x9e, 0x9c, + 0x7a, 0xad, 0x94, 0x5b, 0x7a, 0xad, 0x94, 0x99, 0x85, 0x52, + 0x6b, 0x66, 0x5d, 0x7d, 0xbe, 0x84, 0xdf, 0xa9, 0xa8, 0xef, + 0xd6, 0x05, 0x06, 0x34, 0xcc, 0x56, 0x9f, 0x5c, 0x76, 0x96, + 0x25, 0x99, 0x3e, 0x5d, 0xb7, 0x94, 0x3e, 0x5c, 0x37, 0x9c, + 0xc1, 0xa3, 0xc8, 0x63, 0x84, 
0x7c, 0x30, 0x1e, 0x2d, 0x81, + 0x43, 0x76, 0xb9, 0xdf, 0x6a, 0x4b, 0x53, 0xd7, 0xb4, 0x83, + 0x11, 0xfb, 0x1b, 0xe5, 0x0b, 0xe3, 0xe3, 0x61, 0x0b, 0xe3, + 0x73, 0x65, 0xf4, 0x1c, 0x8c, 0x9a, 0x42, 0xb0, 0x81, 0xf5, + 0x3f, 0xd6, 0x5e, 0x2b, 0x09, 0xd1, 0x68, 0x78, 0xf6, 0x7c, + 0x34, 0x97, 0xb3, 0x72, 0xf5, 0x7b, 0x82, 0xb8, 0x7c, 0x1d, + 0x82, 0xb8, 0x7c, 0x5d, 0x7d, 0x47, 0x83, 0xa2, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x42, 0x78, 0x38, 0x30, 0xe9, 0x08, 0x72, 0x8b, 0x90, 0xd1, + 0x9b, 0xa8, 0x8d, 0xe8, 0x21, 0xd0, 0x7a, 0xf5, 0x97, 0xe7, + 0xfc, 0xf1, 0x37, 0x39, 0xfc, 0xf1, 0x37, 0x39, 0x03, 0x0e, + 0xc8, 0xc6, 0xa8, 0x3a, 0x07, 0x1f, 0xc9, 0x89, 0x53, 0xa3, + 0x67, 0x07, 0xe8, 0xd0, 0x18, 0x6d, 0x74, 0x14, 0xcf, 0xbd, + 0xd8, 0x4a, 0x27, 0x2e, 0xd8, 0x0e, 0x27, 0x2e, 0xd8, 0x0e, + 0xd8, 0xd1, 0x27, 0xf1, 0x20, 0x4d, 0x61, 0xe8, 0xb8, 0x12, + 0x70, 0xe7, 0x17, 0x55, 0x50, 0x4b, 0x5c, 0x9c, 0x4e, 0xd7, + 0xe5, 0x82, 0x8e, 0x6c, 0xce, 0x22, 0x5e, 0x13, 0xc4, 0x02, + 0x5e, 0x53, 0x3b, 0xfd, 0xa1, 0xac, 0x83, 0xaf, 0x89, 0xf6, + 0x1d, 0x2f, 0x00, 0x69, 0xe9, 0xc2, 0x12, 0xbe, 0xca, 0x32, + 0xbc, 0xd8, 0xcc, 0xb0, 0x18, 0x2f, 0x64, 0xb3, 0x19, 0xa5, + 0x6c, 0xb2, 0x19, 0xa5, 0x93, 0x4d, 0xe6, 0x5a, 0xbf, 0x77, + 0xce, 0x27, 0xb7, 0xd3, 0x92, 0xf1, 0x5a, 0x95, 0x4e, 0x05, + 0x60, 0x79, 0x74, 0xf9, 0x90, 0xf8, 0xb5, 0x6a, 0x16, 0xd9, + 0x35, 0x65, 0x16, 0xd9, 0xb5, 0x65, 0xe9, 0x26, 0x4a, 0x9a, + 0xa8, 0xac, 0x85, 0xb5, 0xe3, 0x29, 0x9a, 0xb9, 0xbc, 0x19, + 0xa6, 0xe1, 0x7e, 0xd0, 0xe6, 0x76, 0xad, 0x7b, 0xeb, 0x09, + 0x5d, 0x51, 0xe3, 0x0e, 0xdd, 0x51, 0xe3, 0x0c, 0x22, 0xae, + 0x1c, 0xf3, 0x3a, 0x2a, 0xb9, 0x43, 0xfd, 0x01, 0x22, 0x4a, + 0x5c, 0xc7, 0xff, 0x62, 0xe6, 0x8c, 0xdd, 0x14, 0x57, 0x22, + 0x1c, 0xff, 0x74, 0x07, 0x0d, 0xf1, 0x74, 0x27, 0x1d, 0xf7, + 0x8b, 0xd8, 0xe2, 0x08, 0x4a, 0x4e, 0x03, 
0xa9, 0xa5, 0xff, + 0x80, 0xfa, 0x45, 0x34, 0x7e, 0xde, 0x4c, 0x65, 0x0e, 0x86, + 0xae, 0xc1, 0xe9, 0x00, 0x61, 0xc0, 0x42, 0xc2, 0xe1, 0xe0, + 0x42, 0xc2, 0x1e, 0x1f, 0xbd, 0x3d, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2d, 0xcf, + 0xaf, 0x90, 0x1a, 0xc5, 0x29, 0x00, 0x03, 0x62, 0xfb, 0xdf, + 0x4d, 0x4b, 0xdf, 0x21, 0x43, 0xe1, 0x2b, 0x0a, 0x86, 0x49, + 0xfe, 0x8c, 0x87, 0x49, 0xfe, 0x8c, 0x78, 0xb6, 0x01, 0x73, + 0xf6, 0x3f, 0x6f, 0xd2, 0xe2, 0xf7, 0x78, 0x40, 0xb9, 0xd6, + 0xe0, 0xb4, 0x21, 0x29, 0x0e, 0x8c, 0x83, 0xcc, 0xcf, 0xb2, + 0xa3, 0xd3, 0x4f, 0xb8, 0xa3, 0xd3, 0x4f, 0xb8, 0x5c, 0x2c, + 0xb0, 0x47, 0x42, 0xaf, 0x60, 0x7a, 0x4a, 0x01, 0xf7, 0xea, + 0xdb, 0x4f, 0xf9, 0xd6, 0xd9, 0x75, 0xe1, 0x68, 0xcc, 0x03, + 0xf5, 0x67, 0xb8, 0x95, 0x34, 0x67, 0xd8, 0x15, 0xf4, 0x67, + 0x27, 0xea, 0x0b, 0x98, 0x5c, 0x21, 0x3f, 0x10, 0x7c, 0xf6, + 0x39, 0x60, 0x9c, 0x3c, 0xb2, 0x6b, 0x01, 0x2f, 0x0b, 0x8b, + 0xd4, 0x24, 0xbf, 0x0b, 0x96, 0x0c, 0xab, 0x03, 0x96, 0x0c, + 0xbb, 0x8b, 0x69, 0xf3, 0x44, 0x74, 0xff, 0xec, 0xa7, 0x4e, + 0x06, 0x09, 0x49, 0x8a, 0x3f, 0xc6, 0x0f, 0x2e, 0x55, 0x98, + 0xd2, 0x44, 0x04, 0xae, 0x2e, 0xa0, 0xd4, 0x86, 0x6e, 0xa8, + 0x54, 0x8e, 0x6e, 0xa8, 0xab, 0x71, 0x91, 0x57, 0xeb, 0x8d, + 0xe3, 0x92, 0x9d, 0x42, 0xa5, 0x88, 0x53, 0xd1, 0xfe, 0x8b, + 0x98, 0xef, 0xe3, 0x0b, 0xf6, 0x41, 0x1f, 0xed, 0x50, 0x4b, + 0x9f, 0x5f, 0x50, 0x4b, 0x9f, 0x4d, 0xaf, 0xb4, 0x60, 0xb2, + 0x95, 0xa2, 0x74, 0x96, 0xa2, 0x13, 0xc0, 0x3a, 0x16, 0x37, + 0x67, 0xb5, 0xa6, 0xde, 0x74, 0x1e, 0x26, 0xba, 0x8d, 0x3c, + 0xa4, 0x5f, 0x9f, 0x34, 0xa4, 0x5f, 0x9f, 0x34, 0x5b, 0xa0, + 0x60, 0xcb, 0xd6, 0xd7, 0x42, 0xf8, 0x12, 0x3c, 0x4b, 0xe4, + 0xca, 0xf0, 0xd2, 0x55, 0x6f, 0xfe, 0xa5, 0x6b, 0x64, 0xfd, + 0xc3, 0x1b, 0xe9, 0x77, 0xce, 0x1b, 0xe9, 0x77, 0xce, 0x1b, + 0x16, 0x88, 0x31, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x01, 
0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5e, 0xcd, 0xe5, 0x27, + 0x6b, 0x60, 0xe6, 0x30, 0xc4, 0xd6, 0xbb, 0xd8, 0xbe, 0x1d, + 0x9b, 0xa8, 0xde, 0x19, 0xec, 0x31, 0xfe, 0x9c, 0xe4, 0xa3, + 0xfe, 0x1c, 0xe4, 0xa0, 0x01, 0xe3, 0x1b, 0x5f, 0x4e, 0x58, + 0xd8, 0x86, 0x9a, 0xa5, 0xc2, 0x00, 0x86, 0xee, 0x79, 0x65, + 0x24, 0xa4, 0x73, 0xcd, 0xad, 0xba, 0x27, 0x03, 0x06, 0x39, + 0x6b, 0x81, 0x06, 0x39, 0x63, 0x81, 0xf9, 0xc6, 0x9c, 0x7e, + 0x9d, 0xc6, 0xb2, 0xe3, 0xa3, 0xcf, 0xc4, 0x58, 0xd8, 0x61, + 0x76, 0xaa, 0x78, 0x03, 0x07, 0xef, 0xa1, 0x9b, 0xff, 0xf0, + 0xb1, 0x2b, 0x5f, 0xd8, 0xb1, 0x2b, 0x5f, 0xc8, 0x4e, 0xd4, + 0xa0, 0x37, 0x77, 0x6a, 0x53, 0x55, 0x2a, 0xbb, 0xf6, 0x31, + 0x0e, 0x70, 0x60, 0x94, 0x58, 0x33, 0x4d, 0x49, 0xef, 0xea, + 0x71, 0xdf, 0x64, 0x10, 0x12, 0x4b, 0x6e, 0x10, 0x10, 0x4b, + 0x91, 0xef, 0xef, 0xb4, 0xb7, 0xdc, 0xd2, 0x98, 0x57, 0x81, + 0x3c, 0xd2, 0xe2, 0x43, 0xb8, 0xa2, 0xad, 0xa4, 0xf5, 0x08, + 0x19, 0xb7, 0xb2, 0x6d, 0x6b, 0xbc, 0xbb, 0x0e, 0x69, 0xb4, + 0xbb, 0x0e, 0x96, 0x4b, 0x44, 0xf1, 0x45, 0x3c, 0x05, 0x69, + 0x15, 0x18, 0xf9, 0xec, 0x04, 0x36, 0x72, 0x17, 0x43, 0x4d, + 0x2c, 0x88, 0x54, 0x74, 0xdb, 0xf3, 0x60, 0xf4, 0xbd, 0xba, + 0x40, 0x74, 0xbd, 0xaa, 0xbf, 0x8b, 0x42, 0x55, 0xff, 0xf8, + 0x65, 0xd2, 0x7b, 0x20, 0x2d, 0x3b, 0xa6, 0x4b, 0x66, 0x56, + 0xe3, 0xe3, 0x23, 0x87, 0xe0, 0x30, 0x2d, 0xbe, 0x7c, 0x61, + 0x2d, 0xcb, 0x7c, 0x61, 0x2d, 0x8b, 0x83, 0x9e, 0xd2, 0x74, + 0xb6, 0xc4, 0xb4, 0xf7, 0xf0, 0x9c, 0xe5, 0x40, 0x60, 0x9c, + 0x9c, 0x1b, 0xee, 0xe4, 0x8f, 0x88, 0x10, 0xe2, 0x91, 0x31, + 0x50, 0xd5, 0x9c, 0x50, 0x50, 0xd5, 0x9c, 0x10, 0xaf, 0x2a, + 0x63, 0xef, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xd0, 0x71, 0xc5, 0x4a, 0x13, 0x27, + 0xe6, 
0x0e, 0x3d, 0x53, 0x17, 0x33, 0x86, 0xf7, 0x72, 0xd8, + 0x96, 0x12, 0xc1, 0x58, 0x52, 0x5a, 0xd1, 0x98, 0x52, 0x5b, + 0xd1, 0x98, 0xad, 0xa4, 0x2e, 0x67, 0xf3, 0xcd, 0x40, 0x44, + 0xbd, 0x28, 0x21, 0x89, 0xa6, 0xb4, 0x9a, 0xb8, 0x30, 0xae, + 0xe8, 0xee, 0x2d, 0x39, 0xe2, 0x2e, 0x6f, 0x7c, 0xf1, 0x5c, + 0x6f, 0x78, 0xe3, 0x6e, 0x90, 0x87, 0x1c, 0x91, 0xc9, 0x8d, + 0xc2, 0x64, 0x2b, 0x03, 0x86, 0x5c, 0x94, 0xa8, 0xe6, 0x7a, + 0xf7, 0x40, 0x86, 0xbe, 0xbb, 0x61, 0x1d, 0xe0, 0x99, 0xf9, + 0xbd, 0xb0, 0x99, 0xf9, 0xbd, 0xb0, 0x66, 0x06, 0x42, 0x4f, + 0xc6, 0xb5, 0xef, 0xf2, 0xcb, 0x0c, 0xae, 0x1b, 0x09, 0x8a, + 0x03, 0xcd, 0xb3, 0x7b, 0x22, 0x02, 0x76, 0xe8, 0xab, 0x32, + 0x74, 0xcc, 0x22, 0x26, 0x76, 0xcc, 0x22, 0x22, 0x89, 0x33, + 0xdd, 0xdd, 0xcb, 0xca, 0xe4, 0x24, 0x0f, 0x9b, 0x44, 0xb0, + 0x0c, 0xec, 0x1d, 0x24, 0xe5, 0xc1, 0x47, 0xb3, 0xbc, 0x60, + 0xad, 0x51, 0xe2, 0xe2, 0x2d, 0x5d, 0xe0, 0xe2, 0x2d, 0x55, + 0x1f, 0x1d, 0xd2, 0xaa, 0x78, 0x65, 0x42, 0x20, 0xd9, 0xc6, + 0x7f, 0x58, 0x02, 0x8b, 0x8b, 0x54, 0x64, 0xea, 0xc5, 0x56, + 0x16, 0x04, 0x75, 0xf0, 0x74, 0x9c, 0xcc, 0x80, 0x54, 0x8e, + 0xc5, 0xf0, 0xab, 0x71, 0x3a, 0x0f, 0x31, 0x9e, 0x75, 0x1e, + 0x41, 0x8f, 0x48, 0x84, 0x25, 0x09, 0xd0, 0x30, 0xc0, 0x2f, + 0xda, 0xba, 0x0e, 0x33, 0xb8, 0xea, 0x46, 0x2d, 0xf4, 0x73, + 0x46, 0x2f, 0xf4, 0xf3, 0xb9, 0xd0, 0x0b, 0x0c, 0x2a, 0x2c, + 0xab, 0x38, 0x2c, 0xa1, 0x4e, 0xac, 0x04, 0x56, 0xf1, 0x8a, + 0xca, 0xc3, 0x14, 0xb6, 0x15, 0x83, 0x73, 0x6d, 0xb7, 0x91, + 0x18, 0x47, 0x97, 0x93, 0x10, 0x47, 0x68, 0x6c, 0xef, 0xb8, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x5d, 0x80, 0xb1, 0x27, 0xb4, 0xb2, 0x4b, 0x0c, + 0x6a, 0xaa, 0x6d, 0x8a, 0xf7, 0xae, 0xb4, 0x61, 0xb2, 0x81, + 0x47, 0x95, 0xf4, 0x8f, 0xc0, 0x83, 0xf4, 0x8b, 0xc0, 0x87, + 0x0b, 0x74, 0x3f, 0x78, 0x8b, 0xc9, 0x6d, 0x3d, 0x54, 0x1e, + 0x54, 0xdd, 0x68, 
0x2b, 0xf7, 0xb2, 0x86, 0x79, 0xdb, 0x0a, + 0xdd, 0x1b, 0x4a, 0xb4, 0xe0, 0x06, 0xa7, 0x14, 0xe0, 0x0e, + 0xa7, 0x14, 0x1f, 0xf1, 0x58, 0xeb, 0x66, 0xf8, 0x4f, 0x13, + 0x43, 0x34, 0x4c, 0x27, 0x1d, 0xee, 0xe6, 0x19, 0x14, 0xc8, + 0x9d, 0x41, 0x68, 0x71, 0xba, 0xe5, 0x7f, 0x15, 0xac, 0xa9, + 0x7d, 0x35, 0xae, 0xe9, 0x82, 0xca, 0x51, 0x16, 0xe0, 0xaf, + 0x12, 0x83, 0xac, 0x0c, 0x63, 0x31, 0x0f, 0xb9, 0x00, 0x51, + 0x0e, 0x27, 0xac, 0x84, 0x12, 0x1e, 0x02, 0x4e, 0x0c, 0x2f, + 0x61, 0xce, 0x0c, 0x2f, 0x61, 0x4e, 0xf3, 0xd0, 0x9e, 0xb1, + 0x7c, 0x5b, 0xcb, 0xde, 0x06, 0x2c, 0x19, 0xeb, 0x19, 0x2e, + 0x71, 0x96, 0xc4, 0xac, 0x44, 0x03, 0x5d, 0x1b, 0xb6, 0xae, + 0x45, 0xbe, 0x97, 0x8f, 0x45, 0x3e, 0x97, 0x8f, 0xba, 0xc1, + 0x68, 0x70, 0x46, 0x67, 0xfa, 0x4b, 0x16, 0xd1, 0x80, 0x61, + 0x87, 0xdd, 0x7c, 0x75, 0x0b, 0xe0, 0x48, 0xf4, 0xa6, 0x0b, + 0xe1, 0x02, 0x0b, 0x8a, 0x61, 0x40, 0x8f, 0x8b, 0x61, 0x40, + 0x70, 0x74, 0x9e, 0xbf, 0x7b, 0xfb, 0xcb, 0x91, 0xf4, 0x67, + 0x6c, 0x57, 0xf6, 0x16, 0x4f, 0x75, 0x42, 0x28, 0xbf, 0x0e, + 0x62, 0x69, 0x70, 0xba, 0x4a, 0x2a, 0x60, 0xa0, 0x4a, 0x28, + 0x60, 0xa2, 0xb5, 0xd7, 0x9f, 0x5d, 0xde, 0x3d, 0x13, 0x11, + 0xb6, 0xad, 0x07, 0xf7, 0x65, 0xbf, 0xb0, 0xb6, 0xd6, 0x10, + 0x28, 0xf7, 0x9b, 0x62, 0x0b, 0x48, 0x53, 0xb4, 0x2b, 0x33, + 0xd3, 0xb4, 0x2b, 0x73, 0x2c, 0x4b, 0xd4, 0x8c, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xbb, 0x06, 0xf8, 0xf4, 0xe2, 0x7f, 0x58, 0xb0, 0x4c, 0x95, + 0xc4, 0x15, 0x18, 0xd4, 0x3c, 0xd2, 0x2f, 0xe7, 0x0d, 0x13, + 0x19, 0x67, 0x0f, 0x50, 0x39, 0x67, 0x0c, 0x50, 0xc6, 0x98, + 0xf3, 0xaf, 0xca, 0x03, 0x23, 0x31, 0x22, 0xf6, 0xe4, 0xc6, + 0x43, 0x06, 0xd3, 0x04, 0x83, 0xf9, 0x2b, 0xe0, 0x5a, 0x80, + 0x4c, 0x36, 0x12, 0xf0, 0xc1, 0x77, 0x12, 0xd1, 0xc9, 0x77, + 0xed, 0x2e, 0x36, 0x88, 0xe9, 0x74, 0x24, 0xbb, 0x6a, 0x18, + 0x27, 0x66, 0xfb, 0xab, 0x3c, 
0x45, 0xef, 0xba, 0x9c, 0x13, + 0xe7, 0xe3, 0xe2, 0x8e, 0x62, 0x97, 0x76, 0x03, 0x62, 0xb3, + 0xf6, 0x03, 0x9d, 0x4c, 0x09, 0xfc, 0x3b, 0x18, 0x69, 0x09, + 0xc9, 0xb5, 0x86, 0x0b, 0xb5, 0xc5, 0xe9, 0x2f, 0xd8, 0x24, + 0x62, 0x33, 0x91, 0x04, 0x64, 0xf8, 0x90, 0x01, 0x14, 0xfa, + 0x90, 0x01, 0x20, 0xfa, 0x6f, 0xfe, 0xdf, 0x05, 0x8a, 0x3f, + 0x8b, 0x7e, 0xad, 0x86, 0xaf, 0x6e, 0x24, 0x8a, 0x72, 0xc7, + 0x72, 0xea, 0x7b, 0xf6, 0x20, 0x25, 0x44, 0xb7, 0xa2, 0xb0, + 0x76, 0xc7, 0xa2, 0xa0, 0x76, 0xd7, 0x5d, 0x5f, 0x89, 0x28, + 0x3e, 0xa2, 0x76, 0x94, 0x65, 0x98, 0x57, 0xe7, 0x89, 0xb6, + 0x26, 0x29, 0x9c, 0xfb, 0xb6, 0xa6, 0xa2, 0x7e, 0xfa, 0xe2, + 0xb0, 0xf4, 0xe9, 0xe6, 0xb0, 0xf4, 0xe9, 0xe6, 0x4f, 0x0b, + 0x16, 0x19, 0x4d, 0xa5, 0x55, 0xd0, 0x1f, 0x4d, 0x7b, 0x8d, + 0xda, 0x14, 0xd1, 0xa6, 0x68, 0x62, 0xbb, 0xe5, 0x22, 0xba, + 0x2c, 0xc5, 0xc8, 0x7e, 0x7c, 0x49, 0xc8, 0x7a, 0x7c, 0x4d, + 0x37, 0x85, 0x83, 0xb2, 0xc3, 0x5e, 0x01, 0xcd, 0x9c, 0x11, + 0xf3, 0x2d, 0x7a, 0xee, 0x11, 0x31, 0xdc, 0xab, 0x44, 0xc4, + 0x51, 0x14, 0xb3, 0x77, 0x48, 0xb4, 0x45, 0x01, 0x58, 0xb4, + 0x45, 0x41, 0xa7, 0x4b, 0xba, 0xbe, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xbe, 0xc5, + 0x05, 0xaf, 0x59, 0x37, 0xee, 0x47, 0xeb, 0x14, 0xa1, 0x64, + 0xd2, 0x71, 0x6a, 0x0a, 0x66, 0x20, 0x65, 0xdf, 0xc6, 0x29, + 0x75, 0x22, 0x46, 0x21, 0x75, 0x26, 0xb9, 0xde, 0x8a, 0xd9, + 0x70, 0x38, 0x0c, 0x91, 0x5f, 0x36, 0x4e, 0xfd, 0x6f, 0x1f, + 0x8b, 0x2d, 0x97, 0xe7, 0x26, 0x1e, 0x06, 0x4f, 0x7c, 0xbe, + 0x02, 0xe7, 0xb5, 0xae, 0x82, 0xe7, 0xb4, 0xbe, 0x7d, 0x18, + 0x4b, 0x41, 0xed, 0xf3, 0x96, 0x19, 0xef, 0x3d, 0x82, 0xea, + 0xd6, 0x41, 0xb4, 0x36, 0xd4, 0x62, 0x5b, 0xf6, 0x85, 0xaf, + 0x52, 0xf8, 0xae, 0x61, 0x5e, 0xa7, 0xac, 0x61, 0x5e, 0xa7, + 0x53, 0x9e, 0xa1, 0x58, 0x63, 0x0e, 0x62, 0xd3, 0x65, 0x9c, + 0x9e, 0x88, 0x23, 0xdd, 0xa4, 0xc0, 0x8c, 
0x63, 0xdd, 0xd9, + 0x45, 0xde, 0x60, 0x0f, 0xd6, 0x46, 0x02, 0xa3, 0x54, 0x4e, + 0x02, 0xa3, 0xab, 0xb1, 0xfd, 0x5c, 0xef, 0x16, 0x95, 0xbd, + 0x36, 0x9d, 0x3a, 0xd2, 0x62, 0x0c, 0x58, 0x20, 0x86, 0x9f, + 0x1d, 0xb8, 0xbf, 0x75, 0x78, 0xa6, 0x17, 0x7c, 0x59, 0x21, + 0x97, 0x7c, 0x59, 0x25, 0x68, 0x83, 0xa6, 0xda, 0x5f, 0xd0, + 0x64, 0xf4, 0x18, 0x66, 0x4e, 0xbf, 0x5b, 0xfc, 0x3f, 0x39, + 0xbe, 0xf4, 0x88, 0xe1, 0xe4, 0x44, 0x5a, 0xd6, 0xf7, 0x67, + 0x1a, 0x12, 0xf7, 0x44, 0x1a, 0x12, 0x08, 0xbb, 0xe5, 0xed, + 0xbc, 0xf8, 0xe7, 0x64, 0xf8, 0xf0, 0xa5, 0xc5, 0x00, 0xc5, + 0xf5, 0x39, 0x05, 0x8a, 0xc1, 0x6d, 0x29, 0xfd, 0x8e, 0x4d, + 0x68, 0x94, 0xc7, 0x6d, 0x28, 0x94, 0xc7, 0x6d, 0xd7, 0x6b, + 0x38, 0x92, 0x5a, 0xad, 0x07, 0xc2, 0x9b, 0x86, 0xe7, 0x02, + 0x5f, 0x10, 0xef, 0x0d, 0xe7, 0xf2, 0x98, 0x78, 0x3d, 0xe8, + 0xe4, 0x0f, 0x65, 0xc8, 0xd0, 0x7b, 0x65, 0xc8, 0xd0, 0x5b, + 0x9a, 0x37, 0x2f, 0xa4, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xce, 0xb3, 0x2b, + 0x6c, 0xd6, 0x8b, 0x4e, 0xf2, 0xab, 0xeb, 0x05, 0xb0, 0xe7, + 0xcc, 0xd2, 0x15, 0xfe, 0x97, 0x4d, 0x86, 0xbe, 0xdf, 0xcd, + 0x86, 0xbe, 0xcf, 0xcd, 0x79, 0x41, 0x30, 0x32, 0x9c, 0xa9, + 0x61, 0x00, 0x3e, 0x63, 0xb4, 0x5b, 0x38, 0xf2, 0x36, 0x88, + 0x07, 0xb6, 0x64, 0x45, 0x92, 0xe4, 0x4d, 0xa2, 0x00, 0xb7, + 0x4d, 0x2c, 0x00, 0xb7, 0x4d, 0xac, 0xff, 0x48, 0xb2, 0x53, + 0xcf, 0x72, 0x93, 0xb4, 0x91, 0xbc, 0x1b, 0x03, 0x01, 0x7a, + 0xf9, 0x37, 0x06, 0xcf, 0x5c, 0x8e, 0x2f, 0xc9, 0x76, 0xb3, + 0x18, 0x7a, 0x76, 0xfc, 0x18, 0xba, 0x76, 0xbe, 0xe7, 0x05, + 0x89, 0x41, 0x6a, 0x72, 0x3f, 0xb8, 0x16, 0xfd, 0xf3, 0xdc, + 0xe4, 0x97, 0xbc, 0xcc, 0x12, 0x9b, 0x07, 0x02, 0xaf, 0xd9, + 0xdb, 0x92, 0x8d, 0x59, 0xc7, 0x06, 0x8d, 0xd9, 0xc7, 0x06, + 0x72, 0x26, 0x38, 0xf9, 0xcd, 0x73, 0xe4, 0x35, 0x7a, 0x80, + 0xf9, 0xfb, 0x09, 0x53, 0xe6, 0x44, 0x54, 0x9c, 0x24, 
0x13, + 0xc8, 0xe3, 0x82, 0xe1, 0x48, 0xca, 0x94, 0x63, 0x48, 0xca, + 0x84, 0x63, 0xb7, 0x35, 0x7b, 0x9c, 0x41, 0xbb, 0x1b, 0xf7, + 0x2c, 0xfe, 0x1e, 0x13, 0x65, 0x39, 0xde, 0x26, 0x46, 0x50, + 0xf2, 0xa1, 0x00, 0x0b, 0xd5, 0x79, 0x54, 0x43, 0x15, 0x2a, + 0x44, 0x43, 0x15, 0x6b, 0xbb, 0xbc, 0xea, 0x94, 0xa3, 0x44, + 0xb6, 0xc5, 0x53, 0xaa, 0xcd, 0xa0, 0xe3, 0x06, 0xeb, 0x20, + 0x73, 0xc4, 0x45, 0x0d, 0x1a, 0xba, 0xda, 0x20, 0xd3, 0xe2, + 0xba, 0x64, 0xd3, 0xe2, 0xba, 0x20, 0x2c, 0x1d, 0x45, 0xdf, + 0x34, 0x47, 0x05, 0x41, 0x1c, 0xb3, 0x04, 0x02, 0x19, 0xa1, + 0xa4, 0x04, 0x7f, 0x75, 0xc3, 0xc7, 0xf9, 0xaf, 0xac, 0x8f, + 0xd9, 0xf3, 0x4f, 0xb7, 0xd9, 0xf3, 0x4f, 0x97, 0x26, 0x0c, + 0xb0, 0x68, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x13, 0xb9, 0xd8, 0x0b, 0xa9, 0xaf, + 0x83, 0x19, 0xb5, 0xea, 0x75, 0x81, 0xb1, 0xd3, 0x52, 0x95, + 0x60, 0x4b, 0x18, 0xde, 0x74, 0x9f, 0x98, 0x9d, 0x74, 0xdf, + 0x18, 0x9d, 0x8b, 0x20, 0xe7, 0x62, 0x82, 0x65, 0xb3, 0x18, + 0x0c, 0xa4, 0x3b, 0xfb, 0x39, 0xff, 0xe7, 0x47, 0x7b, 0x1d, + 0xb6, 0xfe, 0x7b, 0xb3, 0xa6, 0xda, 0x53, 0xb5, 0x66, 0xdb, + 0x5b, 0xb5, 0x66, 0xdb, 0xa4, 0x4a, 0x99, 0x24, 0x01, 0x3f, + 0xf3, 0xc0, 0x60, 0x70, 0x14, 0xd2, 0xe0, 0xf8, 0xc4, 0x31, + 0xc9, 0x20, 0xc1, 0x69, 0x35, 0x30, 0x56, 0xc9, 0x84, 0x28, + 0x4d, 0x4a, 0x8c, 0x28, 0x4d, 0x4a, 0x73, 0xd7, 0xb2, 0xb5, + 0x3f, 0x6e, 0x45, 0xd9, 0xa0, 0x54, 0x23, 0x06, 0x23, 0xcd, + 0x8f, 0x76, 0x39, 0x7d, 0xa7, 0x4d, 0x25, 0x90, 0x1e, 0x6b, + 0x21, 0xdf, 0xf3, 0x29, 0x21, 0xd7, 0xfb, 0x69, 0xde, 0x28, + 0x04, 0x96, 0x87, 0x17, 0xd5, 0x68, 0x9c, 0x09, 0x04, 0xce, + 0x1c, 0xc2, 0x2e, 0xed, 0xe5, 0xe2, 0xf5, 0x61, 0x52, 0x34, + 0x4f, 0x27, 0x5a, 0x66, 0x43, 0x21, 0x7a, 0x66, 0x43, 0x21, + 0x85, 0x99, 0xbc, 0xde, 0xb3, 0x1a, 0x43, 0x87, 0x0a, 0xd3, + 0xda, 0xc4, 0xa9, 0xfd, 0x54, 0xd3, 0x60, 0xa0, 0x2c, 0x41, + 0x75, 
0x74, 0x20, 0x25, 0x68, 0x68, 0x2c, 0x03, 0x68, 0x60, + 0x2c, 0x03, 0x97, 0x9f, 0xd3, 0xfc, 0xec, 0x2a, 0x7c, 0x2e, + 0x7b, 0x70, 0x1b, 0x98, 0xc6, 0xa7, 0xd2, 0x32, 0xbd, 0x35, + 0xff, 0x14, 0x27, 0x7d, 0x1a, 0x50, 0x8e, 0x61, 0x3e, 0x50, + 0x8f, 0x61, 0x3e, 0x50, 0x70, 0x9e, 0xc1, 0xaf, 0x06, 0x1f, + 0xe1, 0xf8, 0x97, 0xab, 0x39, 0x0c, 0x51, 0x48, 0x88, 0x2a, + 0xa4, 0xdf, 0x5c, 0x42, 0x5e, 0xee, 0x8a, 0xec, 0x5c, 0xef, + 0xcc, 0xdc, 0x5c, 0xef, 0xcc, 0xdc, 0xa3, 0x10, 0x33, 0x23, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00 +}; + diff --git a/neureka/pointwise/Makefile b/neureka/pointwise/Makefile new file mode 100644 index 0000000..ad8dc3b --- /dev/null +++ b/neureka/pointwise/Makefile @@ -0,0 +1,3 @@ +include ../app/Makefile + +STIM_DIR := ../pointwise/ \ No newline at end of file diff --git a/neureka/pointwise/inc/bias.h b/neureka/pointwise/inc/bias.h new file mode 100644 index 0000000..f28d0f5 --- /dev/null +++ b/neureka/pointwise/inc/bias.h @@ -0,0 +1,9 @@ +#ifndef __BIAS_H__ +#define __BIAS_H__ + +#include + +#define BIAS_SIZE (39) +extern int32_t bias[BIAS_SIZE]; + +#endif // __BIAS_H__ diff --git a/neureka/pointwise/inc/input.h b/neureka/pointwise/inc/input.h new file mode 100644 index 0000000..985f9a9 --- /dev/null +++ b/neureka/pointwise/inc/input.h @@ -0,0 +1,9 @@ +#ifndef __INPUT_H__ +#define __INPUT_H__ + +#include + +#define INPUT_SIZE (8995) +extern uint8_t input[INPUT_SIZE]; + +#endif // __INPUT_H__ diff --git a/neureka/pointwise/inc/layer_conf.h b/neureka/pointwise/inc/layer_conf.h new file mode 100644 index 0000000..e5e4812 --- /dev/null +++ b/neureka/pointwise/inc/layer_conf.h @@ -0,0 +1,42 @@ +#ifndef __LAYER_CONF_H__ +#define __LAYER_CONF_H__ + +#define TEST_NAME "test" +#define INPUT_HEIGHT (7) +#define INPUT_WIDTH (5) +#define INPUT_CHANNEL (257) +#define INPUT_SIGNED (0) +#define 
INPUT_BITS (8) + +#define OUTPUT_HEIGHT (7) +#define OUTPUT_WIDTH (5) +#define OUTPUT_CHANNEL (39) +#define OUTPUT_BITS (8) + +#define WEIGHT_HEIGHT (1) +#define WEIGHT_WIDTH (1) +#define WEIGHT_CHANNEL_IN (257) +#define WEIGHT_CHANNEL_OUT (39) +#define WEIGHT_BITS (8) +#define WEIGHT_OFFSET (-128) + +#define SCALE_BITS (8) + +#define BIAS_BITS (32) + +#define PADDING_TOP (0) +#define PADDING_BOTTOM (0) +#define PADDING_LEFT (0) +#define PADDING_RIGHT (0) +#define PADDING_VALUE (0) + +#define STRIDE_HEIGHT (1) +#define STRIDE_WIDTH (1) + +#define GROUPS (1) +#define OUTSHIFT (12) +#define HAS_NORM_QUANT (1) +#define HAS_BIAS (1) +#define HAS_RELU (1) + +#endif // __LAYER_CONF_H__ diff --git a/neureka/pointwise/inc/output.h b/neureka/pointwise/inc/output.h new file mode 100644 index 0000000..68c6c11 --- /dev/null +++ b/neureka/pointwise/inc/output.h @@ -0,0 +1,14 @@ +#ifndef __OUTPUT_H__ +#define __OUTPUT_H__ + +#include + +#define OUTPUT_SIZE (1365) +extern uint8_t output[OUTPUT_SIZE]; + +#define GOLDEN_OUTPUT_SIZE (1365) +extern uint8_t golden_output[GOLDEN_OUTPUT_SIZE]; + +int check_output(); + +#endif // __OUTPUT_H__ diff --git a/neureka/pointwise/inc/scale.h b/neureka/pointwise/inc/scale.h new file mode 100644 index 0000000..67dea26 --- /dev/null +++ b/neureka/pointwise/inc/scale.h @@ -0,0 +1,9 @@ +#ifndef __SCALE_H__ +#define __SCALE_H__ + +#include + +#define SCALE_SIZE (39) +extern uint8_t scale[SCALE_SIZE]; + +#endif // __SCALE_H__ diff --git a/neureka/pointwise/inc/weight.h b/neureka/pointwise/inc/weight.h new file mode 100644 index 0000000..ba39fae --- /dev/null +++ b/neureka/pointwise/inc/weight.h @@ -0,0 +1,9 @@ +#ifndef __WEIGHT_H__ +#define __WEIGHT_H__ + +#include + +#define WEIGHT_SIZE (11232) +extern uint8_t weight[WEIGHT_SIZE]; + +#endif // __WEIGHT_H__ diff --git a/neureka/pointwise/src/bias.c b/neureka/pointwise/src/bias.c new file mode 100644 index 0000000..55b2013 --- /dev/null +++ b/neureka/pointwise/src/bias.c @@ -0,0 +1,10 @@ +#include 
"bias.h" + +#define BIAS_SIZE (39) +PI_L1 int32_t bias[BIAS_SIZE] = { + 0x36dfa, -0xec5f, 0x3ee6f, -0x23173, -0xb8b1, -0x236a3, -0x3fe84, 0x1285e, 0x9a43, -0xfb97, + 0x396a7, -0x16b65, 0x5527, 0xb8f, 0xe316, 0x3c2e1, 0x2bf5e, -0x1adf0, -0x486, 0x1ed7d, + 0x3a88e, 0x8e46, -0x3e8ef, 0xcbee, 0x65e5, 0x2426b, 0x298f4, 0x205dc, 0x2598, -0x2151c, + 0x390ef, -0x2318f, 0x1bc1, 0xe222, 0x1a09e, -0x1f3fd, 0x3d956, -0x12e4d, 0x33ee0 +}; + diff --git a/neureka/pointwise/src/input.c b/neureka/pointwise/src/input.c new file mode 100644 index 0000000..b6afde2 --- /dev/null +++ b/neureka/pointwise/src/input.c @@ -0,0 +1,906 @@ +#include "input.h" + +#define INPUT_SIZE (8995) +PI_L1 uint8_t input[INPUT_SIZE] = { + 0xc2, 0xe1, 0x4d, 0x2d, 0x90, 0xf3, 0x32, 0xc8, 0x6d, 0x7a, + 0x72, 0x92, 0xc8, 0x2d, 0x28, 0x1b, 0x48, 0xe5, 0x2c, 0x55, + 0x62, 0x70, 0xd0, 0x6e, 0x3c, 0x2b, 0x77, 0x12, 0xdf, 0x21, + 0x02, 0x62, 0xc1, 0xae, 0x33, 0x74, 0xd1, 0xbd, 0x81, 0x35, + 0xb8, 0xf3, 0x0b, 0xb3, 0x5f, 0x10, 0x9c, 0xe5, 0x2d, 0x8d, + 0xa2, 0x92, 0x94, 0xbb, 0x8e, 0xda, 0x96, 0xf8, 0xe2, 0x45, + 0x7f, 0xb6, 0x04, 0x30, 0x4b, 0x00, 0x06, 0x95, 0x77, 0x44, + 0x1f, 0x66, 0xaf, 0x5d, 0x95, 0xb9, 0x4e, 0xcd, 0xde, 0x94, + 0x7f, 0x2e, 0x93, 0x3b, 0xec, 0x87, 0x51, 0xd8, 0x96, 0x02, + 0x91, 0x96, 0x18, 0x23, 0x4c, 0xd1, 0x04, 0x66, 0xde, 0xf6, + 0xc6, 0x32, 0xec, 0x4c, 0x59, 0x75, 0xd6, 0x30, 0x17, 0xc5, + 0xe6, 0xc4, 0xe7, 0x8e, 0x65, 0x02, 0x1d, 0x3a, 0xd9, 0x3c, + 0x8f, 0xe2, 0xf4, 0xcc, 0x88, 0xc6, 0xea, 0xc3, 0xaf, 0x4b, + 0xb6, 0x48, 0x01, 0xaf, 0x0f, 0xfa, 0xdc, 0x74, 0x1c, 0x29, + 0x31, 0xf8, 0x9c, 0x54, 0xa4, 0xf0, 0xb5, 0xbe, 0xe4, 0x43, + 0xaf, 0x5a, 0x62, 0x55, 0x69, 0x89, 0x53, 0x0c, 0xc7, 0x1a, + 0x70, 0xb0, 0x1f, 0xe7, 0x91, 0x60, 0x08, 0x51, 0xd8, 0xc6, + 0x8e, 0x37, 0x48, 0x64, 0x57, 0x90, 0xef, 0x85, 0xb6, 0x1d, + 0xc9, 0x95, 0x47, 0x9f, 0xd8, 0xe3, 0xd0, 0x77, 0x1a, 0x59, + 0x1c, 0x4f, 0x1e, 0x65, 0x08, 0xba, 0x3a, 0xc4, 0xd0, 0x29, + 0x98, 0x72, 0x2f, 0x54, 0x3f, 0x2d, 0xd0, 0x00, 
0xbd, 0x4d, + 0xad, 0x8a, 0x99, 0xe8, 0x75, 0x42, 0x32, 0xc7, 0x01, 0xf3, + 0xf9, 0xbe, 0x66, 0x2a, 0x77, 0x93, 0x90, 0x69, 0x55, 0x34, + 0x6e, 0x48, 0x1a, 0x9b, 0xc6, 0xbb, 0x35, 0x5d, 0x64, 0xd8, + 0xe0, 0x86, 0x1a, 0x2f, 0xa1, 0x5f, 0xd9, 0x07, 0xb8, 0x42, + 0x49, 0xac, 0x6c, 0x0f, 0xdd, 0x27, 0x69, 0x36, 0x43, 0x14, + 0x99, 0x7f, 0x01, 0xdd, 0xad, 0x30, 0xf2, 0x46, 0xa4, 0x5b, + 0x74, 0xed, 0xac, 0x90, 0xbb, 0x75, 0xd1, 0x64, 0xe1, 0x5d, + 0x3d, 0xf1, 0x29, 0x6d, 0x44, 0xb2, 0x4a, 0x39, 0x8c, 0x9b, + 0x25, 0xa2, 0x89, 0x5c, 0x14, 0xba, 0xd9, 0xa0, 0x0b, 0x22, + 0x84, 0xfb, 0xd5, 0xe9, 0xe2, 0x42, 0x0c, 0xe8, 0x5d, 0x0a, + 0x99, 0xf7, 0x61, 0x35, 0xe5, 0x7b, 0x8c, 0x2c, 0x38, 0x1a, + 0xca, 0x94, 0x66, 0xae, 0x45, 0xa9, 0x4c, 0x52, 0x18, 0x1f, + 0x08, 0xf8, 0xfd, 0x9c, 0x03, 0xf2, 0xfb, 0xfe, 0xa9, 0x55, + 0xae, 0x69, 0x2c, 0x93, 0xd7, 0x8d, 0x2f, 0x63, 0xf7, 0x17, + 0x37, 0xe3, 0xeb, 0x0f, 0x2e, 0x2b, 0x33, 0x53, 0xcc, 0xfb, + 0x4d, 0x4f, 0x7f, 0x85, 0xcf, 0x16, 0x34, 0x88, 0x2d, 0x44, + 0x58, 0xda, 0x76, 0xca, 0xc3, 0xa8, 0x45, 0xc7, 0x9a, 0x25, + 0xd8, 0x59, 0x83, 0x5b, 0x21, 0xcd, 0x00, 0x2b, 0xcc, 0xa7, + 0x24, 0x45, 0x98, 0x02, 0x64, 0x7d, 0x18, 0x19, 0xe0, 0x03, + 0x0d, 0xc9, 0x1a, 0x5b, 0x29, 0x61, 0x46, 0xb4, 0x54, 0xe4, + 0x20, 0xb0, 0xcd, 0xa3, 0xf6, 0xb9, 0x43, 0x91, 0xcc, 0xc8, + 0x2b, 0xea, 0x49, 0xce, 0x83, 0x12, 0xb6, 0xc2, 0x38, 0x36, + 0x0b, 0x71, 0x22, 0x72, 0x78, 0x77, 0x01, 0xf0, 0xd3, 0x74, + 0x53, 0x70, 0x19, 0xf8, 0xac, 0x70, 0xf1, 0xd0, 0xdc, 0xea, + 0x71, 0x3b, 0xa8, 0xfc, 0x12, 0x3b, 0x06, 0x5c, 0x7d, 0x21, + 0x6e, 0x1b, 0xbf, 0xea, 0x79, 0x84, 0x19, 0x42, 0xf3, 0xbd, + 0xc1, 0x0e, 0xf8, 0x31, 0x3c, 0xfc, 0x34, 0x7f, 0xf6, 0x34, + 0xad, 0x0d, 0x01, 0x19, 0xa0, 0x61, 0x1e, 0xe8, 0xc6, 0x56, + 0x42, 0xf5, 0xb2, 0xa4, 0x90, 0x6e, 0x59, 0x3f, 0x89, 0x92, + 0x8d, 0x5d, 0x17, 0x69, 0x41, 0x19, 0x17, 0xca, 0xbd, 0x5b, + 0xb1, 0x4f, 0xd7, 0xcb, 0xe9, 0x68, 0xc3, 0x38, 0x5a, 0x5c, + 0x7f, 0x0b, 0xe9, 0xf3, 0x6e, 0x20, 0xcd, 0xca, 0x2d, 0x95, + 
0xec, 0x85, 0xf0, 0xcd, 0xd4, 0xbd, 0x1f, 0xe5, 0xda, 0xf2, + 0x1b, 0xca, 0x43, 0x2e, 0x15, 0x06, 0xba, 0x29, 0x8e, 0xc0, + 0x99, 0x5e, 0x0f, 0x4f, 0x47, 0x41, 0xac, 0x69, 0xb4, 0xcd, + 0x00, 0x10, 0x21, 0xfc, 0x6b, 0xe2, 0x3e, 0x9b, 0xc6, 0xaa, + 0x5d, 0x8c, 0x28, 0x54, 0x36, 0xdc, 0x6a, 0xbc, 0xa6, 0x8d, + 0xf2, 0x07, 0x3b, 0x1c, 0x33, 0x05, 0x82, 0xd4, 0xaa, 0x5c, + 0xd4, 0xc6, 0x8d, 0x2d, 0x1e, 0x2c, 0xe4, 0xed, 0x12, 0x75, + 0x14, 0xbb, 0xf6, 0x04, 0x21, 0xc6, 0xdd, 0xc6, 0x52, 0xd5, + 0x71, 0x8b, 0x83, 0x0b, 0x89, 0x41, 0xe3, 0x36, 0x12, 0xef, + 0xdd, 0xce, 0x03, 0xdd, 0x7e, 0xef, 0x93, 0x22, 0x63, 0x25, + 0xb7, 0x1f, 0x4c, 0x17, 0x4e, 0x2e, 0x14, 0x1f, 0xc3, 0x6e, + 0xd7, 0x3c, 0x69, 0x15, 0xb7, 0xe8, 0x62, 0x2c, 0x7a, 0xbf, + 0x24, 0xe4, 0xa5, 0x50, 0xef, 0xa0, 0x53, 0x7c, 0x49, 0x20, + 0x37, 0x46, 0x27, 0x5f, 0x81, 0x1f, 0xb9, 0x08, 0x3b, 0x8f, + 0xcc, 0x13, 0xf6, 0x4c, 0x2f, 0x17, 0x68, 0x04, 0x7b, 0xd6, + 0x93, 0x1a, 0x50, 0x38, 0x2e, 0xda, 0x0a, 0x21, 0x4e, 0xf1, + 0x03, 0xea, 0xeb, 0x59, 0x63, 0xef, 0xf9, 0xf2, 0xcf, 0xa0, + 0x43, 0xc3, 0x6b, 0x9e, 0x07, 0x70, 0x05, 0x7d, 0xa5, 0x88, + 0x81, 0x45, 0xf1, 0x77, 0x5b, 0xd3, 0xc7, 0x3c, 0x0e, 0x3c, + 0xc5, 0x12, 0xc6, 0x37, 0xa9, 0xc7, 0x97, 0x84, 0xcf, 0x77, + 0xd1, 0x25, 0x98, 0x69, 0xf8, 0x1d, 0x8e, 0x97, 0xea, 0x8b, + 0xdb, 0xba, 0x02, 0x66, 0xee, 0x60, 0xea, 0x5b, 0x00, 0x7e, + 0xef, 0xed, 0x99, 0x1d, 0x97, 0xc5, 0x78, 0x53, 0x95, 0x7e, + 0xc6, 0xb2, 0x99, 0x51, 0xfa, 0xac, 0x55, 0xbe, 0x59, 0x1a, + 0xba, 0x87, 0x65, 0x19, 0x20, 0xe1, 0x3a, 0xa8, 0x52, 0xd1, + 0x3d, 0x77, 0xb9, 0x68, 0xdc, 0x63, 0x5a, 0xc0, 0xbc, 0x75, + 0x94, 0xc6, 0xa0, 0xc7, 0x31, 0xc2, 0x00, 0x63, 0x55, 0x74, + 0x5e, 0x79, 0x12, 0xdb, 0xc0, 0xbf, 0xd6, 0xbe, 0x5a, 0xa9, + 0xa9, 0xbd, 0xf2, 0xf8, 0x76, 0x1e, 0xfc, 0xc8, 0x77, 0x3b, + 0xb1, 0xc0, 0x4c, 0xa0, 0xa2, 0x92, 0xb5, 0x37, 0x63, 0x38, + 0x64, 0xf5, 0x06, 0xee, 0x53, 0x08, 0xc2, 0x70, 0xdc, 0x34, + 0x60, 0xa8, 0x23, 0x78, 0xb3, 0xed, 0xe6, 0x1d, 0x6a, 0x7e, + 0xbf, 0x88, 
0x4c, 0x01, 0x9f, 0x52, 0x1a, 0xac, 0xa6, 0x57, + 0x6f, 0xb7, 0xae, 0x02, 0x4d, 0x96, 0x8b, 0xfe, 0x92, 0x09, + 0x01, 0xb5, 0xa2, 0x9e, 0x45, 0x09, 0x61, 0xbc, 0xa7, 0x70, + 0x4e, 0x5a, 0x0d, 0x88, 0xc2, 0xe0, 0xdd, 0x81, 0xf7, 0x6b, + 0xa8, 0x2d, 0xc7, 0xd9, 0x32, 0x8b, 0x29, 0x35, 0x65, 0x10, + 0xb2, 0x4a, 0xe9, 0xb0, 0x58, 0xd7, 0x78, 0xae, 0x66, 0x6c, + 0xd0, 0x6f, 0x4c, 0xfe, 0x5c, 0x09, 0xee, 0xb1, 0x99, 0xdd, + 0x67, 0xa2, 0x01, 0x87, 0xe8, 0xe1, 0xd5, 0x8e, 0xf3, 0x4c, + 0x68, 0x67, 0x4b, 0xb8, 0x73, 0x9a, 0xa9, 0x5c, 0x8a, 0x37, + 0x6b, 0xb9, 0xf0, 0x42, 0xb5, 0xbe, 0x69, 0x83, 0x60, 0x8c, + 0xdb, 0x33, 0xf0, 0x74, 0x57, 0x3b, 0x81, 0xee, 0xb2, 0x2b, + 0xf2, 0xf3, 0xad, 0xe0, 0x68, 0x94, 0xb4, 0xaa, 0x33, 0x43, + 0xe0, 0x3e, 0xbe, 0x7d, 0x58, 0xb9, 0xa2, 0x44, 0x80, 0x6f, + 0x63, 0xbe, 0xf9, 0xc3, 0x5a, 0x84, 0xfe, 0xbd, 0xbe, 0xcb, + 0xad, 0x3a, 0x77, 0x6c, 0x36, 0xc9, 0x94, 0x61, 0x17, 0xd8, + 0xa2, 0x5d, 0xe0, 0x60, 0xfa, 0xa5, 0xc3, 0x68, 0x73, 0xb8, + 0x81, 0xbb, 0x00, 0x6a, 0xc4, 0xdd, 0xb9, 0x10, 0x25, 0x75, + 0x93, 0xb6, 0x51, 0xca, 0xbf, 0x75, 0xca, 0x64, 0xdf, 0x67, + 0xe4, 0x62, 0xd8, 0x46, 0xed, 0x2a, 0xc2, 0xd7, 0x91, 0x99, + 0x27, 0x7a, 0xfc, 0x21, 0xaf, 0x3c, 0x7d, 0xc9, 0x2a, 0xfd, + 0x40, 0x6a, 0xca, 0xb4, 0x90, 0x78, 0x3c, 0xe7, 0xae, 0x62, + 0x04, 0x98, 0x7b, 0x01, 0xe8, 0xed, 0x6f, 0xe3, 0x3f, 0xed, + 0xc9, 0xee, 0x98, 0x66, 0x3a, 0xde, 0x89, 0xd6, 0xb8, 0x25, + 0xf0, 0x09, 0xd1, 0xa4, 0x77, 0xec, 0xa3, 0xd6, 0xdb, 0x15, + 0xe5, 0x3b, 0xdd, 0x36, 0x96, 0x69, 0x26, 0x99, 0x86, 0xcf, + 0x32, 0x6e, 0x4b, 0x57, 0x00, 0x52, 0x12, 0xa4, 0xf8, 0x37, + 0xf5, 0x27, 0x70, 0xc7, 0x4e, 0xb8, 0x09, 0xe5, 0x18, 0xaf, + 0x23, 0x59, 0xa6, 0xbe, 0xa5, 0x25, 0x37, 0xb1, 0xf0, 0x60, + 0x95, 0xcf, 0x6a, 0x8c, 0x96, 0xd8, 0x1c, 0x02, 0xd5, 0x5d, + 0x32, 0x44, 0x2b, 0x75, 0x52, 0xcf, 0x08, 0x74, 0x2e, 0x1c, + 0xfa, 0x91, 0x7e, 0xd5, 0x36, 0xe7, 0xfe, 0x85, 0x68, 0x09, + 0x45, 0xdb, 0x7e, 0x98, 0x2a, 0x7b, 0x6f, 0xe8, 0x04, 0x6c, + 0xf1, 0xb0, 0xda, 0x5d, 
0x9c, 0xb1, 0x81, 0xe7, 0x1d, 0x16, + 0x33, 0xb6, 0xb8, 0x10, 0xf0, 0xfd, 0x25, 0x1f, 0xb8, 0xf1, + 0xea, 0x60, 0x58, 0xe5, 0x3a, 0xf0, 0x6a, 0xbd, 0x05, 0xc1, + 0xe1, 0x5c, 0x8e, 0x16, 0xdb, 0x61, 0xb8, 0x07, 0x97, 0x1d, + 0x84, 0x76, 0x44, 0x43, 0x55, 0xc5, 0x2f, 0x07, 0x39, 0x76, + 0xfb, 0x7d, 0x65, 0xb2, 0xdc, 0xdc, 0x9e, 0xed, 0x17, 0x38, + 0xdf, 0x1b, 0xb9, 0x27, 0xe1, 0xbb, 0x7d, 0x17, 0x23, 0xd1, + 0x9b, 0x49, 0x20, 0xb2, 0x3b, 0x86, 0x9e, 0xae, 0x86, 0xfd, + 0xc1, 0xbf, 0x4d, 0x31, 0x35, 0xc7, 0xd1, 0x49, 0xe7, 0x8d, + 0x83, 0xd2, 0xcc, 0x75, 0x3a, 0x25, 0x69, 0x82, 0x68, 0x72, + 0x46, 0x3f, 0x6d, 0xa4, 0xfa, 0x73, 0xa5, 0x30, 0x5c, 0x2f, + 0xd8, 0x61, 0xa0, 0x1e, 0x75, 0xcc, 0x8f, 0x56, 0x8c, 0x13, + 0xc6, 0x81, 0x69, 0x3b, 0x18, 0x86, 0x04, 0x08, 0x6b, 0xdd, + 0x89, 0x08, 0x6a, 0x09, 0x7f, 0xd6, 0x96, 0x86, 0x60, 0x1e, + 0x1a, 0x5b, 0x5d, 0xc3, 0x6a, 0x40, 0xc8, 0x4c, 0x49, 0x3b, + 0xed, 0xfa, 0xe0, 0x59, 0x5a, 0x85, 0x62, 0xa2, 0xce, 0x1b, + 0x02, 0xa0, 0x8f, 0x3b, 0xdb, 0x54, 0xe4, 0x83, 0x99, 0xb5, + 0xdd, 0xb0, 0xb0, 0x0b, 0xb0, 0xfa, 0x63, 0x5c, 0xab, 0x9b, + 0xe7, 0x21, 0xa4, 0x31, 0xa9, 0x59, 0x62, 0x07, 0x6f, 0x9c, + 0x01, 0x4a, 0x33, 0xe3, 0xa4, 0xba, 0xae, 0x8a, 0xf7, 0x4c, + 0x3f, 0x06, 0xa1, 0x61, 0x2b, 0xa5, 0xc3, 0xb1, 0xe0, 0xe6, + 0x42, 0x40, 0x21, 0x3b, 0x00, 0x3f, 0xbd, 0x2e, 0xfd, 0xfc, + 0xa8, 0x93, 0x2c, 0x9f, 0x33, 0x1b, 0x1b, 0xe0, 0x59, 0xa5, + 0xf6, 0xf4, 0x6b, 0x79, 0x07, 0x95, 0x55, 0x0e, 0x4c, 0x2c, + 0x9c, 0x65, 0xca, 0xb5, 0x0b, 0x5e, 0x8a, 0xaf, 0xf9, 0x48, + 0xad, 0xee, 0x75, 0x5d, 0xf4, 0x18, 0xf2, 0x24, 0x80, 0xd1, + 0x84, 0xd5, 0x72, 0x1e, 0xe9, 0x1a, 0xbc, 0x08, 0x44, 0xbd, + 0x4c, 0xf3, 0xa9, 0xaa, 0x83, 0xe2, 0xb8, 0x75, 0x89, 0x00, + 0x22, 0x84, 0x76, 0x0b, 0xf9, 0x11, 0xd9, 0x23, 0xd6, 0xc3, + 0x02, 0x5d, 0x80, 0xc9, 0x0d, 0xa0, 0x01, 0xf1, 0xfe, 0x5c, + 0x70, 0xad, 0x47, 0x58, 0xb4, 0x0b, 0x6e, 0xe6, 0xd6, 0xbc, + 0x21, 0xfd, 0xd6, 0x2d, 0x8c, 0x98, 0x56, 0x57, 0x6a, 0xaa, + 0x3f, 0xf2, 0x72, 0x68, 0x97, 0x8a, 
0x38, 0x8b, 0xd4, 0xf6, + 0xe7, 0x97, 0x77, 0xf5, 0xfe, 0x3f, 0x74, 0x1b, 0xfa, 0xbc, + 0x43, 0xc9, 0xc5, 0x46, 0x3c, 0xa1, 0x1f, 0x20, 0x9a, 0xe0, + 0xcb, 0xa4, 0x92, 0x3e, 0xd4, 0xf7, 0xdb, 0x72, 0x17, 0x6a, + 0x7c, 0xb5, 0xed, 0x96, 0xf8, 0x08, 0xf9, 0x95, 0x1f, 0xa4, + 0xb1, 0xeb, 0x43, 0x7b, 0x54, 0x8b, 0x81, 0x03, 0xe9, 0xf0, + 0x27, 0x82, 0x12, 0x38, 0x2f, 0xdf, 0x2b, 0x84, 0x7f, 0x4b, + 0x21, 0x05, 0x73, 0x1c, 0x14, 0xfb, 0xd2, 0x8e, 0xfe, 0x26, + 0x56, 0xed, 0x69, 0x91, 0x04, 0x53, 0xea, 0x98, 0xde, 0x6a, + 0xd1, 0x36, 0xb5, 0x8e, 0x63, 0x85, 0xf1, 0x3c, 0xcb, 0xe5, + 0x10, 0x4e, 0xd3, 0x36, 0xb9, 0xab, 0x03, 0x39, 0x09, 0x8e, + 0xf5, 0xd3, 0xf7, 0x35, 0x70, 0x3b, 0x0b, 0x98, 0xb7, 0xb6, + 0x58, 0x31, 0x20, 0xed, 0x10, 0x7a, 0x63, 0x9a, 0x3c, 0x3d, + 0x4e, 0x34, 0x40, 0x1d, 0xdb, 0x43, 0x2f, 0x6f, 0x74, 0x51, + 0xc3, 0x77, 0x76, 0xe8, 0x5d, 0x0d, 0x0f, 0x22, 0xd7, 0x3f, + 0xfb, 0xf5, 0xce, 0x6a, 0x60, 0x81, 0x42, 0x64, 0x81, 0x45, + 0x24, 0xeb, 0x81, 0xf8, 0xe5, 0x13, 0xd7, 0xec, 0x07, 0x1d, + 0x5c, 0x42, 0xe2, 0x9b, 0x98, 0x9e, 0x57, 0x0f, 0xed, 0xd8, + 0x26, 0x91, 0x8c, 0x50, 0x69, 0x5b, 0x3a, 0x0a, 0x8e, 0x23, + 0x20, 0xaa, 0x3e, 0x3d, 0xd5, 0x83, 0x87, 0x8c, 0xe2, 0xe7, + 0xbc, 0x24, 0x90, 0x4e, 0x1a, 0x69, 0x18, 0x0a, 0x7e, 0x68, + 0x1c, 0x6f, 0xd2, 0x26, 0x76, 0x37, 0xe5, 0x1d, 0xe1, 0x96, + 0x42, 0x74, 0x72, 0x16, 0x8b, 0xa3, 0x90, 0xa1, 0x91, 0x48, + 0x4b, 0x56, 0x09, 0x2e, 0xea, 0xa3, 0x08, 0xd9, 0x2f, 0x04, + 0x51, 0x60, 0x65, 0x3b, 0x4b, 0x95, 0xba, 0xe8, 0x87, 0x16, + 0xe5, 0x3f, 0xb8, 0x42, 0x08, 0xdb, 0x82, 0x7b, 0xe7, 0xf6, + 0xb7, 0x6d, 0xc2, 0x82, 0xb6, 0x4f, 0xac, 0xcc, 0x70, 0x2f, + 0x56, 0xbc, 0xd4, 0x79, 0x96, 0xca, 0x02, 0x38, 0x40, 0xd3, + 0x0d, 0x8b, 0xd9, 0x53, 0x4d, 0xd7, 0xe6, 0x96, 0x70, 0x71, + 0x6a, 0xf5, 0xd3, 0xdb, 0xa7, 0x42, 0xe7, 0x6c, 0x04, 0xf1, + 0x8c, 0xe4, 0xf4, 0x57, 0x32, 0xf6, 0x94, 0x80, 0x5e, 0xa3, + 0xe2, 0x2e, 0x59, 0xb3, 0xf9, 0x4e, 0x40, 0x9a, 0x72, 0x7d, + 0xa5, 0xc8, 0x83, 0xc1, 0xae, 0x84, 0x41, 0x3e, 
0x9b, 0xa6, + 0x9b, 0x5b, 0xf9, 0xc1, 0xe4, 0xad, 0x0f, 0xc7, 0x3a, 0x62, + 0xd0, 0x0b, 0x43, 0xd0, 0x59, 0x91, 0x25, 0x26, 0x91, 0x0a, + 0xe1, 0x44, 0xdf, 0xdf, 0xbb, 0xca, 0xec, 0x04, 0x8a, 0xb7, + 0x21, 0x18, 0x93, 0x5a, 0xac, 0x03, 0xe4, 0x6a, 0xf5, 0x2b, + 0x2b, 0xb5, 0xc9, 0x07, 0x62, 0x44, 0xf6, 0x2e, 0xc5, 0x03, + 0xb7, 0xb1, 0xd3, 0x18, 0xe9, 0xd3, 0x93, 0xd1, 0x52, 0x8c, + 0x36, 0xd1, 0x89, 0xfd, 0xca, 0x71, 0x42, 0x36, 0xd0, 0x3b, + 0x7b, 0x9b, 0xcb, 0x22, 0xf8, 0x52, 0xcc, 0x56, 0xb8, 0xfe, + 0x0d, 0xc2, 0x49, 0x0a, 0x4d, 0xc6, 0x66, 0x52, 0xa4, 0x31, + 0x3c, 0x79, 0x9c, 0xd4, 0x9b, 0x84, 0x13, 0xec, 0x8f, 0xa1, + 0x70, 0xd2, 0xf7, 0x73, 0x71, 0x47, 0x58, 0xef, 0x29, 0x40, + 0x48, 0x76, 0x11, 0x49, 0x9f, 0xf9, 0x6f, 0x93, 0xcd, 0xa4, + 0x3e, 0xea, 0xf3, 0xfd, 0x14, 0xa0, 0xce, 0x0e, 0x29, 0x53, + 0xfb, 0xbc, 0xfe, 0xcf, 0x1f, 0x62, 0x26, 0x65, 0xd1, 0x57, + 0x6e, 0x6f, 0xac, 0x29, 0x83, 0x44, 0x2e, 0x56, 0x48, 0xba, + 0xca, 0x1f, 0x60, 0x54, 0xd6, 0x50, 0xf9, 0xc8, 0xdf, 0xd0, + 0xa4, 0xe1, 0x49, 0x0e, 0x30, 0x1b, 0x7b, 0xb5, 0x78, 0x17, + 0xa4, 0x94, 0x0a, 0x5f, 0xd3, 0x12, 0x4a, 0x0c, 0xb1, 0x35, + 0xdd, 0x1c, 0x54, 0x83, 0xc3, 0x85, 0x06, 0x4b, 0x18, 0x46, + 0x20, 0xf4, 0x24, 0x77, 0x22, 0x34, 0x58, 0x93, 0xf2, 0xd8, + 0x03, 0xac, 0x64, 0x08, 0x88, 0xb8, 0x9a, 0x21, 0x94, 0xda, + 0xe1, 0x64, 0xc8, 0x2b, 0x30, 0xa8, 0xc4, 0xb9, 0x54, 0x6e, + 0xef, 0x5b, 0x5d, 0x62, 0x6e, 0xda, 0x76, 0xf6, 0x59, 0x54, + 0x1f, 0x62, 0x1c, 0xcf, 0xef, 0x84, 0x8e, 0x04, 0x6a, 0x62, + 0x28, 0x39, 0x24, 0xae, 0xa9, 0x33, 0x48, 0x69, 0x0f, 0x63, + 0x65, 0x01, 0x2a, 0xb6, 0x15, 0xf0, 0x73, 0x09, 0xce, 0x81, + 0xfb, 0x3d, 0x81, 0x2e, 0x46, 0x35, 0x33, 0xf9, 0x59, 0x99, + 0xd5, 0x33, 0x42, 0x08, 0xbd, 0xfe, 0x59, 0xe1, 0x98, 0x7d, + 0xdd, 0x46, 0x44, 0x6e, 0x6d, 0x08, 0xf2, 0xfa, 0xdd, 0xce, + 0xb1, 0x94, 0x09, 0x6c, 0x21, 0xb2, 0x37, 0xf4, 0x46, 0x36, + 0xdc, 0xb5, 0xcc, 0xd5, 0x68, 0x5d, 0x8a, 0x0e, 0xc3, 0xa4, + 0xab, 0xd5, 0xf0, 0x70, 0xe7, 0x80, 0xdc, 0xcb, 0xe2, 0xba, + 
0x79, 0xa9, 0xed, 0x91, 0x4b, 0x20, 0x5e, 0xab, 0x6f, 0xb7, + 0xa2, 0x09, 0x18, 0x57, 0xad, 0xee, 0xc9, 0xa1, 0x4e, 0xa0, + 0x78, 0xb5, 0x37, 0x87, 0xad, 0xee, 0xf0, 0x5e, 0x28, 0x40, + 0x4b, 0xab, 0x86, 0xb4, 0x36, 0x83, 0x48, 0xe5, 0xc0, 0x9f, + 0x1d, 0x3f, 0xdc, 0x19, 0xf7, 0xd1, 0x7b, 0x9c, 0xe5, 0xe6, + 0x06, 0xbb, 0x95, 0xae, 0x8d, 0xea, 0x22, 0x3e, 0x49, 0x23, + 0xf0, 0x9c, 0xa4, 0x10, 0x41, 0x12, 0x59, 0x60, 0xad, 0xbb, + 0xa6, 0xf1, 0xec, 0xb2, 0xb5, 0xfe, 0x2e, 0x35, 0x26, 0x7e, + 0x6e, 0x06, 0xc7, 0xa2, 0x61, 0xe3, 0x22, 0x3f, 0x89, 0x8e, + 0x6a, 0xeb, 0xe3, 0x2c, 0xcf, 0xcd, 0xb0, 0xd6, 0x65, 0xd0, + 0x9b, 0xd5, 0x6a, 0x79, 0xd6, 0xf7, 0x3e, 0x87, 0xde, 0x15, + 0xb6, 0x19, 0x79, 0xc3, 0xbf, 0x7b, 0xc0, 0xf6, 0x90, 0x5c, + 0x50, 0x2a, 0x85, 0x03, 0x82, 0x56, 0x31, 0x62, 0x78, 0x84, + 0x14, 0x8e, 0x3e, 0x0c, 0x3f, 0x6d, 0x49, 0x9b, 0x33, 0x82, + 0x19, 0x39, 0x2f, 0x40, 0x8e, 0x20, 0x71, 0x64, 0x6a, 0x07, + 0x55, 0xed, 0xd4, 0xc9, 0x77, 0xa8, 0xf2, 0xb2, 0x40, 0xb3, + 0x91, 0xa1, 0xa5, 0xcf, 0x4d, 0x43, 0x3a, 0x7c, 0xa7, 0xa9, + 0xe0, 0x10, 0x60, 0xf3, 0x10, 0xb9, 0xb8, 0x8b, 0x93, 0x23, + 0x58, 0x18, 0x0f, 0x20, 0x87, 0x1f, 0x3f, 0x5e, 0xc5, 0x66, + 0xdd, 0x06, 0x13, 0x54, 0x8d, 0xfe, 0x83, 0xeb, 0x19, 0x3e, + 0xd3, 0x7b, 0x51, 0x11, 0xcd, 0xc8, 0x4d, 0x93, 0x50, 0x63, + 0x66, 0xa7, 0xcd, 0x61, 0x1a, 0x03, 0xdb, 0xcf, 0x7d, 0x4a, + 0x88, 0xd3, 0xa4, 0x10, 0x1f, 0x28, 0x31, 0xe0, 0x8a, 0xdf, + 0xd2, 0xac, 0x48, 0xd3, 0x8e, 0x79, 0x69, 0xd3, 0xf5, 0xc1, + 0x86, 0x79, 0x71, 0x73, 0x1f, 0x47, 0x7e, 0x8b, 0x3c, 0x12, + 0x24, 0xfe, 0x98, 0xdf, 0xc4, 0x10, 0xae, 0x84, 0xb8, 0xfe, + 0xa4, 0x49, 0x67, 0xf9, 0xbd, 0xa7, 0x03, 0x1b, 0xd5, 0x40, + 0x3a, 0xd5, 0xb7, 0x5c, 0xd4, 0x94, 0x1e, 0x96, 0x46, 0x01, + 0xf8, 0xb0, 0x20, 0xcf, 0x02, 0x59, 0xcf, 0x11, 0x37, 0xd2, + 0xe7, 0x56, 0xc9, 0x89, 0xc7, 0x77, 0x0b, 0x24, 0x04, 0xca, + 0xa6, 0x44, 0x43, 0xda, 0x28, 0x86, 0x9f, 0x5b, 0x06, 0x1b, + 0xc4, 0xcd, 0xa0, 0x9c, 0x89, 0x18, 0x03, 0x47, 0x50, 0x6d, + 0xe8, 0xcd, 
0x18, 0x53, 0x10, 0xef, 0xa2, 0xa2, 0xf6, 0xcf, + 0xdf, 0x92, 0x1a, 0x50, 0x3f, 0xdb, 0x8e, 0x58, 0x40, 0x24, + 0xeb, 0xd0, 0xbd, 0x5f, 0x8b, 0x01, 0xb1, 0x65, 0x59, 0x18, + 0xd9, 0x62, 0xda, 0x70, 0x64, 0x10, 0x15, 0xce, 0x0b, 0x26, + 0xc0, 0x98, 0xd2, 0xa9, 0x19, 0x65, 0xdb, 0x22, 0x77, 0x8f, + 0x80, 0x79, 0x51, 0xc4, 0xa4, 0x67, 0xcf, 0x89, 0x8c, 0x9a, + 0xd3, 0x80, 0x2f, 0x18, 0x78, 0x26, 0x16, 0xeb, 0xe0, 0x17, + 0x69, 0xa5, 0x17, 0x9c, 0xa2, 0x8f, 0x08, 0x74, 0xad, 0x67, + 0x8f, 0xaa, 0x91, 0x0a, 0x9e, 0x56, 0x83, 0x14, 0x36, 0x34, + 0xcc, 0xef, 0xfe, 0x17, 0x9f, 0xc3, 0xdd, 0x12, 0x5c, 0x23, + 0xa7, 0x70, 0x58, 0x48, 0x06, 0x9c, 0x36, 0xe5, 0x2a, 0xfa, + 0x65, 0xe3, 0xd4, 0xdb, 0x48, 0xd7, 0xc6, 0xd0, 0xc8, 0x0a, + 0x29, 0x01, 0x76, 0x0a, 0x7f, 0x08, 0xd6, 0xda, 0x50, 0x0f, + 0x37, 0x30, 0xc4, 0xdd, 0x4e, 0x4a, 0x1f, 0x05, 0x0c, 0x30, + 0x6b, 0xa1, 0x55, 0x5a, 0xc8, 0x1d, 0x05, 0x81, 0xc6, 0x66, + 0xc0, 0x05, 0xd6, 0x65, 0xde, 0x29, 0x8e, 0xf3, 0xc7, 0x50, + 0x9a, 0x0e, 0x70, 0xbf, 0x32, 0xec, 0xa9, 0x3e, 0xf2, 0x09, + 0x36, 0xdb, 0xd9, 0x6f, 0xfc, 0x95, 0xb4, 0x8b, 0x4d, 0x4a, + 0x1a, 0xce, 0x31, 0x3f, 0xf3, 0x8b, 0x2f, 0x86, 0xb0, 0x34, + 0xa8, 0x0d, 0x5f, 0x19, 0x16, 0x18, 0x57, 0xf0, 0xa7, 0x4d, + 0x00, 0xc7, 0x81, 0x4a, 0x38, 0x5a, 0x3e, 0x03, 0x53, 0x87, + 0xd1, 0x55, 0xc8, 0x9b, 0xe7, 0xe5, 0x2f, 0xd8, 0x22, 0x3d, + 0xe2, 0xe2, 0xa1, 0x09, 0xf7, 0xc3, 0x77, 0x18, 0x06, 0xd6, + 0x45, 0x65, 0x3c, 0x53, 0xe0, 0x49, 0x80, 0x83, 0x52, 0x95, + 0x91, 0x51, 0x03, 0xb4, 0x06, 0x43, 0x84, 0x75, 0x90, 0xd9, + 0xfe, 0x3e, 0xc4, 0x0f, 0xee, 0x2d, 0x5e, 0xdc, 0x40, 0xaf, + 0x83, 0xb8, 0x77, 0x70, 0x7d, 0x68, 0xef, 0xe8, 0x58, 0x4a, + 0x2d, 0x15, 0xba, 0x41, 0x22, 0xd4, 0xc0, 0xf3, 0x00, 0x3f, + 0xb3, 0x13, 0x06, 0xe5, 0xe8, 0xfe, 0xfc, 0x40, 0xda, 0x39, + 0x38, 0x74, 0x87, 0xf2, 0x55, 0x01, 0x7c, 0x84, 0x0f, 0x16, + 0x52, 0x80, 0x0a, 0x91, 0xe8, 0x0d, 0xde, 0xc6, 0xeb, 0x5d, + 0x06, 0x5e, 0x89, 0x98, 0x60, 0xc9, 0x31, 0x38, 0x20, 0x29, + 0xe1, 0xcc, 0x1c, 0x29, 
0xc8, 0xb6, 0xf3, 0x6b, 0x9b, 0x84, + 0xf6, 0x62, 0x4e, 0x5c, 0xa2, 0x2a, 0x7a, 0x71, 0xf6, 0x6f, + 0xb0, 0xa3, 0x8e, 0x37, 0x79, 0xa7, 0x35, 0x85, 0xbf, 0xc4, + 0x59, 0x51, 0xb8, 0x50, 0x1a, 0xd4, 0x30, 0x88, 0x14, 0x0d, + 0x21, 0x7f, 0x5c, 0xe0, 0xf3, 0x5d, 0x02, 0x92, 0xee, 0x68, + 0x8b, 0x07, 0x71, 0x71, 0xd9, 0x93, 0x51, 0x5e, 0xb4, 0xe5, + 0x50, 0x12, 0x1e, 0x14, 0x7b, 0x4d, 0xf8, 0xb0, 0x1a, 0x59, + 0x96, 0x9b, 0x8d, 0x3e, 0x6d, 0x6b, 0x8d, 0xf1, 0x1f, 0x4f, + 0x61, 0x22, 0x15, 0xa4, 0x61, 0x25, 0x8d, 0x4d, 0x86, 0xa8, + 0xfd, 0xf4, 0x7a, 0xdb, 0x50, 0x9c, 0xce, 0xe8, 0x9e, 0x19, + 0xe7, 0x4e, 0x28, 0x7b, 0x06, 0xab, 0x56, 0x22, 0xea, 0xa0, + 0xb2, 0x33, 0xb1, 0x4f, 0x6e, 0x29, 0xa4, 0x40, 0x3d, 0xa3, + 0x0e, 0x5b, 0x0e, 0xdc, 0x4b, 0x63, 0x66, 0xee, 0x0f, 0x1b, + 0xee, 0x74, 0x04, 0xd9, 0x23, 0x76, 0xf7, 0x01, 0x2e, 0x91, + 0x48, 0x2e, 0xc1, 0x02, 0x7b, 0x97, 0xde, 0x27, 0x81, 0x53, + 0x29, 0xd7, 0x6c, 0xe4, 0x9d, 0x42, 0xcf, 0xea, 0x3d, 0xf8, + 0xbb, 0xf2, 0x58, 0x7b, 0x06, 0x09, 0x57, 0xb8, 0xb2, 0x37, + 0x62, 0xc1, 0x92, 0xd9, 0xea, 0x0c, 0x54, 0xe6, 0xcd, 0x05, + 0x78, 0xe3, 0x90, 0x0d, 0x1e, 0xeb, 0x8e, 0x80, 0x9b, 0x14, + 0x0a, 0xae, 0xa7, 0x5c, 0xab, 0x91, 0x55, 0x6f, 0x8c, 0x24, + 0xe8, 0x13, 0x78, 0x39, 0x7f, 0x85, 0x22, 0xcc, 0xde, 0x6a, + 0x3a, 0x6a, 0x7b, 0x9e, 0x07, 0x2d, 0xb9, 0x7c, 0x01, 0x17, + 0x59, 0x2a, 0x7f, 0xd6, 0x0c, 0xe0, 0xdc, 0xc4, 0x55, 0xdd, + 0x1c, 0x30, 0x75, 0xdb, 0x81, 0x96, 0xe9, 0x90, 0xa6, 0x48, + 0x4a, 0x88, 0x76, 0xdb, 0x3a, 0xc7, 0xd0, 0x1b, 0x9f, 0x9d, + 0x39, 0xba, 0x70, 0x8a, 0xbe, 0xdc, 0x8f, 0xd0, 0x42, 0x6c, + 0x67, 0xf8, 0x7f, 0x49, 0x4e, 0x5d, 0x22, 0xee, 0x02, 0xec, + 0x29, 0xb0, 0xb1, 0xf8, 0x06, 0x16, 0x67, 0xd2, 0x3f, 0x8e, + 0x88, 0x25, 0x3b, 0x96, 0x89, 0x49, 0xf4, 0x58, 0xb0, 0xd4, + 0x05, 0xfa, 0x48, 0x8f, 0x40, 0xcd, 0xa5, 0xae, 0x3d, 0xec, + 0x3a, 0x26, 0xa4, 0x71, 0x51, 0x19, 0x1d, 0x5a, 0x2a, 0x08, + 0xc8, 0x12, 0x5a, 0x19, 0x66, 0x84, 0xa6, 0xb6, 0xd3, 0xc0, + 0x34, 0x22, 0xe9, 0xfe, 0x84, 0x03, 
0x09, 0x87, 0xbe, 0xa9, + 0xb5, 0x2e, 0xdb, 0xa4, 0xdb, 0x2b, 0x94, 0xc4, 0xd1, 0x72, + 0xf3, 0x87, 0xad, 0x49, 0x10, 0x8d, 0x65, 0x27, 0x82, 0x12, + 0x96, 0x3d, 0x7c, 0x6f, 0xd2, 0xad, 0x9d, 0xfb, 0x0e, 0x51, + 0x3e, 0x5a, 0x4d, 0x0f, 0xb9, 0x3e, 0x67, 0xce, 0x2b, 0x90, + 0xbe, 0x11, 0x9f, 0xd2, 0x42, 0x15, 0x57, 0x31, 0x9a, 0x40, + 0x98, 0x60, 0x24, 0xce, 0x95, 0x9e, 0x06, 0xeb, 0x09, 0xd1, + 0x16, 0x9b, 0x3b, 0xbf, 0x8c, 0x55, 0x90, 0x90, 0x63, 0x4b, + 0x4e, 0x6b, 0xc2, 0x50, 0xe2, 0x5f, 0x48, 0x8d, 0xc2, 0x3f, + 0xc4, 0xf0, 0x19, 0xbe, 0x6c, 0xb2, 0x1b, 0xe7, 0xcd, 0xa5, + 0x1f, 0xa6, 0x10, 0xeb, 0x33, 0x41, 0xf9, 0x30, 0x4c, 0x8d, + 0x23, 0xc3, 0xa0, 0xee, 0x77, 0x90, 0x93, 0x75, 0x8d, 0x6b, + 0x8f, 0xea, 0x33, 0xf3, 0xc1, 0xea, 0x57, 0x72, 0x39, 0x2d, + 0x5c, 0xa6, 0xb8, 0xe6, 0x9e, 0x5d, 0x89, 0x2f, 0xbe, 0xf5, + 0x1f, 0x67, 0x8c, 0xf0, 0x06, 0x74, 0x3a, 0x83, 0x24, 0x63, + 0xf7, 0x41, 0x36, 0xa3, 0x6e, 0x4c, 0x3a, 0xe6, 0x45, 0x92, + 0xe7, 0x54, 0x28, 0xe2, 0x25, 0xaf, 0x3a, 0xd8, 0xe4, 0x29, + 0x32, 0x43, 0x67, 0x12, 0x34, 0x34, 0x3b, 0x34, 0xea, 0xed, + 0x7d, 0xd6, 0x1d, 0x86, 0x22, 0xd8, 0x89, 0xb5, 0xdc, 0x23, + 0x57, 0x75, 0x50, 0xc5, 0x08, 0x24, 0xc7, 0x1a, 0x94, 0xbe, + 0x30, 0x35, 0x7d, 0x53, 0x55, 0x5f, 0xe0, 0x16, 0x43, 0x1c, + 0x82, 0xa6, 0x83, 0xfb, 0x57, 0x5b, 0x38, 0x8f, 0x43, 0xc7, + 0x17, 0x0b, 0x33, 0x3c, 0x96, 0x7e, 0xce, 0x10, 0x8b, 0x60, + 0xab, 0xa2, 0x26, 0x6d, 0x78, 0x26, 0x5b, 0xb8, 0xd7, 0x58, + 0x8c, 0x09, 0x2f, 0x3a, 0x61, 0x1f, 0x65, 0xe1, 0xcb, 0xfd, + 0x9c, 0x5a, 0x31, 0x09, 0x67, 0x93, 0xed, 0x91, 0xa7, 0x4b, + 0xaa, 0xe5, 0xa1, 0xb9, 0x99, 0x15, 0x7f, 0xe0, 0xa6, 0x3b, + 0x6e, 0x79, 0x5d, 0xe2, 0x05, 0xd6, 0x25, 0xe8, 0xbb, 0x26, + 0xc1, 0x0f, 0x88, 0xce, 0xd7, 0xfd, 0xcf, 0x34, 0x89, 0xd1, + 0xb2, 0xdb, 0xf6, 0xba, 0xc0, 0x81, 0x67, 0x8f, 0x9a, 0xdf, + 0x05, 0xe2, 0x56, 0xdd, 0xdd, 0xee, 0xa5, 0x4c, 0x74, 0xc6, + 0x7b, 0x99, 0xec, 0xf5, 0x0b, 0x6c, 0xd1, 0xbc, 0x1c, 0x29, + 0x58, 0x0a, 0x13, 0xe3, 0x13, 0xb9, 0x77, 0x4c, 
0xc1, 0x96, + 0xb7, 0xa8, 0x3f, 0x6d, 0x1a, 0x8c, 0x9d, 0xae, 0xa2, 0x08, + 0x09, 0x1c, 0x52, 0xe2, 0xa4, 0xd8, 0x67, 0x6a, 0xdf, 0x73, + 0xfc, 0x2a, 0x26, 0xce, 0x29, 0x5e, 0x06, 0x38, 0x43, 0xcc, + 0xb8, 0x34, 0xe0, 0x57, 0xbb, 0x15, 0x11, 0xf7, 0x64, 0x5c, + 0x36, 0x7a, 0x71, 0x92, 0x9d, 0xb2, 0x83, 0xb1, 0xa1, 0xdb, + 0xbd, 0xe5, 0xcd, 0xe1, 0x0b, 0x73, 0x71, 0x98, 0x34, 0xde, + 0x2d, 0x4d, 0x57, 0x10, 0x83, 0xe2, 0x6f, 0xe0, 0x77, 0xa1, + 0x7c, 0x75, 0x8e, 0x92, 0xea, 0xec, 0x40, 0x8d, 0xc2, 0x86, + 0x6b, 0x09, 0xd5, 0x30, 0x97, 0xb5, 0xc3, 0x1a, 0x79, 0x41, + 0x37, 0x87, 0x2e, 0xcc, 0x08, 0xb0, 0x04, 0xf2, 0x24, 0x04, + 0x23, 0xab, 0x6a, 0x0d, 0xf6, 0x5d, 0x41, 0xfb, 0x9d, 0x2b, + 0x0e, 0xf9, 0xee, 0x49, 0x53, 0x59, 0x8b, 0xc4, 0x9c, 0x42, + 0xcf, 0xec, 0x60, 0xb3, 0xd1, 0xe6, 0x74, 0x5a, 0xd3, 0x2e, + 0x85, 0xbe, 0x86, 0xb1, 0x94, 0x4e, 0x2b, 0xaf, 0x1b, 0xc3, + 0xe6, 0xdd, 0x0f, 0x5e, 0x6d, 0x46, 0x86, 0x8a, 0xe4, 0x32, + 0x93, 0x46, 0x32, 0x12, 0x48, 0x16, 0x2a, 0x81, 0x50, 0x95, + 0x40, 0xaf, 0x2a, 0xe6, 0x3b, 0x2e, 0xab, 0x91, 0xb2, 0x3d, + 0x81, 0x54, 0xd1, 0x8a, 0xc4, 0xb9, 0xed, 0x0a, 0x42, 0xdf, + 0xc5, 0x89, 0x1e, 0x3f, 0x61, 0xc5, 0x1e, 0x7c, 0xb3, 0x3d, + 0x0c, 0x95, 0x55, 0x34, 0xfa, 0x18, 0xce, 0x91, 0x5e, 0xb3, + 0x57, 0xfd, 0xcb, 0x5d, 0xaa, 0x38, 0x81, 0xae, 0xf4, 0xbf, + 0xf4, 0xc2, 0x38, 0x41, 0x68, 0x2c, 0x27, 0xeb, 0xfc, 0xc5, + 0xa2, 0x5c, 0x51, 0x92, 0x65, 0xf1, 0x45, 0xf9, 0xbd, 0x55, + 0x31, 0xb3, 0xa4, 0x56, 0x14, 0x92, 0xa7, 0x18, 0xcd, 0x54, + 0x80, 0x6e, 0x68, 0x39, 0xec, 0x7f, 0x3e, 0x91, 0x52, 0xfd, + 0xca, 0x05, 0x10, 0xb5, 0xe2, 0x23, 0x4e, 0x3f, 0xe7, 0xc6, + 0xef, 0xa0, 0x06, 0xe0, 0x6f, 0x36, 0x2b, 0xd1, 0xf7, 0x7d, + 0x07, 0xcd, 0xa9, 0xaf, 0xca, 0x46, 0xf2, 0x5b, 0x75, 0x25, + 0xa6, 0xba, 0x16, 0xa8, 0x28, 0xb5, 0xb3, 0x4f, 0xcc, 0x0a, + 0xf3, 0xaf, 0x60, 0xbd, 0x8d, 0x50, 0xb3, 0xbd, 0x29, 0xe2, + 0x43, 0xd8, 0xbf, 0x68, 0xec, 0x86, 0x3b, 0xac, 0xcc, 0xaa, + 0x70, 0xb8, 0x5b, 0xc3, 0xa4, 0xfc, 0xfb, 0x5d, 0x5c, 0x38, + 
0x09, 0xcf, 0xee, 0x27, 0x87, 0x3c, 0x88, 0x48, 0xa0, 0x67, + 0x24, 0xf5, 0xcb, 0x60, 0xae, 0x8c, 0x7c, 0xe6, 0x23, 0x1d, + 0x23, 0x67, 0xa5, 0xb9, 0x20, 0xc1, 0xed, 0xae, 0xfd, 0xe9, + 0x35, 0x43, 0x99, 0x39, 0x27, 0x55, 0x7c, 0xe4, 0x30, 0x76, + 0xc7, 0x68, 0xc2, 0xb9, 0x95, 0xc6, 0x9a, 0x3f, 0x14, 0x1e, + 0xd5, 0x1a, 0x7e, 0xf9, 0x76, 0x99, 0x4b, 0x42, 0x5b, 0xfa, + 0x78, 0x4a, 0x15, 0xa3, 0xef, 0x2e, 0xae, 0x58, 0x2a, 0x81, + 0xb8, 0xf2, 0xb4, 0x49, 0x13, 0x32, 0x20, 0xd2, 0x8d, 0x66, + 0xc9, 0x21, 0xf9, 0x1a, 0x60, 0x75, 0x47, 0xc7, 0xab, 0x00, + 0xe8, 0xe7, 0xd8, 0xa2, 0x2b, 0x01, 0x48, 0x0d, 0x93, 0xfc, + 0x40, 0xc8, 0x88, 0x73, 0xd3, 0x29, 0x73, 0xda, 0x2d, 0x6d, + 0xc8, 0xd0, 0x66, 0x78, 0x69, 0x37, 0xc2, 0xd3, 0x1d, 0xab, + 0x5d, 0xa2, 0xbc, 0xd5, 0x0d, 0x74, 0xd4, 0xf4, 0x21, 0x86, + 0xc6, 0x00, 0x56, 0x28, 0x35, 0xc1, 0xfd, 0xad, 0x6b, 0x17, + 0xf0, 0xf4, 0x39, 0x82, 0x3b, 0xd6, 0xce, 0xbc, 0xa0, 0x59, + 0xcb, 0xec, 0xe6, 0xf2, 0xb9, 0xcb, 0x0e, 0x56, 0x58, 0x2e, + 0x11, 0xf6, 0x4e, 0x0b, 0xa6, 0xde, 0xb5, 0xde, 0x76, 0x32, + 0xbf, 0xa3, 0x52, 0x62, 0xbf, 0x3e, 0x13, 0x81, 0x4e, 0x2e, + 0xb7, 0x0b, 0xe4, 0xa3, 0x4e, 0xf1, 0x8f, 0x4d, 0x62, 0x2a, + 0x0a, 0x5e, 0xe9, 0xe3, 0xf7, 0x7b, 0x81, 0x52, 0xd5, 0x38, + 0x6d, 0x1f, 0x21, 0x41, 0x82, 0xbf, 0x58, 0x03, 0x94, 0xa0, + 0x89, 0x77, 0xac, 0x02, 0xe5, 0x51, 0x80, 0xd4, 0x37, 0xa7, + 0x90, 0x43, 0x47, 0x8d, 0x54, 0xf1, 0xe3, 0xce, 0x84, 0x28, + 0xb5, 0xb1, 0xef, 0xf2, 0x03, 0x26, 0xb9, 0x61, 0x8c, 0x74, + 0xed, 0x9b, 0xd7, 0x1e, 0xf3, 0x45, 0xe0, 0x8b, 0xa1, 0x8c, + 0xfc, 0xb4, 0x9b, 0x4e, 0x42, 0x92, 0x0c, 0xf2, 0x35, 0xf6, + 0x1b, 0x3f, 0x21, 0xea, 0x88, 0x8c, 0xce, 0x17, 0x1b, 0x99, + 0x13, 0x48, 0xe3, 0x66, 0x6a, 0x2f, 0xf6, 0x9b, 0x18, 0xf3, + 0xa3, 0x1c, 0x8c, 0x11, 0x93, 0xfe, 0x98, 0x85, 0x5c, 0x6e, + 0x84, 0x6e, 0x53, 0xbc, 0x46, 0xd7, 0x99, 0xea, 0xc1, 0xab, + 0x91, 0x61, 0x8f, 0xf5, 0xe6, 0xe7, 0xd1, 0x97, 0x43, 0x44, + 0xe7, 0x3a, 0x3c, 0x24, 0xe6, 0x30, 0x9a, 0x2d, 0x3f, 0x4f, + 0x67, 0x14, 
0x73, 0x4c, 0x87, 0x70, 0xb8, 0x98, 0x03, 0xc8, + 0xf1, 0x83, 0x07, 0x93, 0xb4, 0x36, 0xdb, 0xd8, 0xce, 0x39, + 0x0d, 0xc3, 0x29, 0x50, 0x08, 0xe7, 0xb5, 0x51, 0xed, 0xb6, + 0xca, 0x0d, 0x67, 0x45, 0xed, 0x61, 0xdf, 0x39, 0x3a, 0x63, + 0xd6, 0x9a, 0x9c, 0x95, 0x13, 0x58, 0x0b, 0x80, 0x41, 0x4b, + 0x9e, 0x56, 0xd2, 0xb6, 0x81, 0x39, 0x2b, 0x94, 0x48, 0xc3, + 0x81, 0xbe, 0x09, 0x1a, 0xd3, 0x98, 0xb2, 0xfa, 0x47, 0x1b, + 0x26, 0xd8, 0xe2, 0x46, 0xe0, 0x46, 0x6e, 0xb9, 0x6a, 0x54, + 0x37, 0xa5, 0x67, 0x11, 0xfc, 0xca, 0x2b, 0xe3, 0xb6, 0xd1, + 0x4a, 0x99, 0x32, 0x28, 0x0a, 0xdb, 0xfd, 0x6a, 0xb1, 0xab, + 0xd5, 0x93, 0x39, 0x24, 0x0c, 0x7a, 0xb4, 0xb5, 0xe1, 0x1e, + 0x8c, 0x53, 0x65, 0xca, 0x2c, 0x17, 0x63, 0xba, 0x45, 0x00, + 0xdd, 0xbc, 0x60, 0xda, 0x76, 0x20, 0x7b, 0x99, 0xe2, 0xa2, + 0x03, 0x8b, 0xb5, 0x44, 0xa3, 0xae, 0x82, 0x8c, 0xdd, 0xfb, + 0xbd, 0xb3, 0xc6, 0x04, 0xa6, 0x52, 0x6e, 0x37, 0x3f, 0x92, + 0x64, 0x26, 0xc8, 0x0e, 0xd3, 0x8e, 0x71, 0x85, 0x0b, 0x6c, + 0xf2, 0x15, 0x78, 0x0a, 0x44, 0x84, 0xcf, 0xd8, 0x9c, 0x88, + 0x6a, 0x53, 0x1f, 0x74, 0xd5, 0xdd, 0xb1, 0x4a, 0x11, 0x18, + 0xb1, 0x72, 0xc4, 0x7c, 0xe9, 0x83, 0x30, 0x4f, 0xdd, 0xf5, + 0x74, 0xb5, 0x5c, 0xa7, 0x07, 0xfc, 0xe5, 0xd8, 0x1f, 0xf2, + 0x4a, 0x71, 0xe6, 0x8d, 0xf8, 0xbc, 0x91, 0x8e, 0x1d, 0x12, + 0x11, 0xca, 0xf2, 0x17, 0x76, 0x29, 0x03, 0x92, 0x24, 0x17, + 0xba, 0xcc, 0x1f, 0x3e, 0xb3, 0xdb, 0xbf, 0xe4, 0x04, 0xd6, + 0x12, 0xa2, 0xb2, 0x2e, 0x93, 0x89, 0x5e, 0x8f, 0xb6, 0x8c, + 0xf7, 0x80, 0xfc, 0x17, 0xf7, 0x3f, 0x28, 0xaa, 0xc5, 0x8b, + 0xa2, 0x84, 0x9a, 0x9f, 0xde, 0x05, 0x0f, 0xa1, 0x2d, 0x05, + 0x75, 0xed, 0x53, 0x96, 0xbc, 0xba, 0xcd, 0x99, 0xf7, 0xbb, + 0xe0, 0x47, 0x02, 0xb4, 0x4c, 0x10, 0xce, 0xd9, 0x6a, 0x27, + 0xbe, 0x35, 0x4d, 0xf8, 0x5c, 0x2b, 0xfb, 0x82, 0x72, 0x0c, + 0xec, 0x29, 0x95, 0xab, 0xa2, 0x52, 0xe4, 0x54, 0x30, 0x6c, + 0xad, 0xdf, 0x76, 0x29, 0x9c, 0x21, 0x0d, 0x06, 0x61, 0x6d, + 0x99, 0x3b, 0x51, 0xee, 0xb0, 0x47, 0x65, 0x61, 0xf8, 0x4b, + 0xc7, 0xf0, 0x28, 0x7f, 
0xc8, 0x24, 0x80, 0x22, 0x6c, 0xc9, + 0x73, 0x39, 0x9b, 0x9f, 0xe5, 0x93, 0x14, 0xc4, 0x7c, 0x91, + 0x35, 0xec, 0x04, 0xa6, 0x5c, 0xaa, 0xe5, 0xfd, 0x35, 0x81, + 0x05, 0x83, 0x54, 0xc4, 0xad, 0x47, 0x0e, 0x60, 0x6c, 0x52, + 0x7d, 0x06, 0x76, 0x9e, 0xe6, 0x91, 0xc3, 0xf4, 0x5b, 0xf1, + 0xad, 0x1a, 0x51, 0xac, 0x49, 0x10, 0xb2, 0x18, 0x25, 0x06, + 0x90, 0x29, 0x46, 0xe3, 0xd5, 0xe1, 0xca, 0x56, 0x1a, 0x6b, + 0x24, 0x52, 0xf8, 0x08, 0xab, 0x52, 0x6c, 0x8d, 0xf7, 0x59, + 0xaa, 0x0a, 0xf5, 0x37, 0x31, 0x64, 0x3b, 0xb7, 0x12, 0x76, + 0xc4, 0xb4, 0x83, 0x9d, 0x76, 0x39, 0x1f, 0x64, 0x85, 0xed, + 0x67, 0x98, 0x97, 0x57, 0x76, 0xe9, 0x0d, 0xc8, 0x4f, 0xc4, + 0xc7, 0xf8, 0xad, 0x1d, 0x6f, 0xf3, 0xae, 0x4a, 0xf0, 0x3d, + 0x71, 0x73, 0xba, 0x74, 0xa7, 0xd0, 0xc7, 0x62, 0x69, 0xb0, + 0x5e, 0x25, 0x87, 0xe8, 0xb7, 0xa5, 0x54, 0x54, 0xe2, 0x7e, + 0x13, 0xd1, 0x6d, 0x70, 0x5b, 0x65, 0x49, 0x7c, 0x40, 0xdd, + 0xba, 0x95, 0x34, 0xc6, 0xa1, 0x01, 0x47, 0x67, 0xa9, 0x42, + 0xde, 0xc7, 0xb1, 0x1e, 0x2e, 0xd6, 0x83, 0x1e, 0x34, 0x30, + 0xd1, 0x76, 0x85, 0x33, 0x41, 0x72, 0xd5, 0x28, 0xc4, 0xdd, + 0x41, 0x78, 0x6a, 0x64, 0xee, 0x45, 0xa1, 0x90, 0x9e, 0xf2, + 0x16, 0xb1, 0x4d, 0xd1, 0xbc, 0x11, 0x2b, 0x0b, 0x19, 0xbf, + 0x7e, 0x63, 0xf1, 0x40, 0xbd, 0x16, 0x96, 0xb3, 0xa7, 0x1e, + 0x0e, 0x8a, 0x92, 0xbb, 0x7d, 0xcd, 0x03, 0x6e, 0x1c, 0x0a, + 0x58, 0x46, 0x88, 0xac, 0x80, 0x63, 0xd5, 0x92, 0xa9, 0xeb, + 0xec, 0x17, 0x9a, 0x0a, 0xe3, 0x75, 0xfe, 0x1f, 0xb8, 0x70, + 0xf5, 0x3c, 0x01, 0x2a, 0x10, 0x9d, 0xd7, 0xef, 0xa7, 0xb2, + 0xb5, 0xab, 0x5a, 0x77, 0x53, 0x61, 0x45, 0x9f, 0x1f, 0xca, + 0x95, 0xe0, 0x83, 0xf7, 0x11, 0xef, 0x52, 0x3d, 0xcc, 0x55, + 0xba, 0x7d, 0x71, 0x49, 0x0c, 0x26, 0x76, 0x89, 0x4c, 0x44, + 0x10, 0xbf, 0x64, 0xad, 0x56, 0x16, 0xb1, 0x83, 0xd0, 0x59, + 0xa5, 0x2a, 0x6a, 0x94, 0x78, 0xd4, 0x0f, 0x78, 0x1b, 0x28, + 0x15, 0xed, 0x05, 0x14, 0x38, 0xbe, 0x67, 0x76, 0x13, 0x83, + 0x6b, 0xca, 0x16, 0xd3, 0x38, 0xf2, 0x57, 0xb7, 0x72, 0x21, + 0xa6, 0x1e, 0x94, 0x3f, 0x61, 0x0b, 
0x34, 0x9b, 0x30, 0x39, + 0x4e, 0x3a, 0x89, 0x03, 0x9c, 0x99, 0x2b, 0x9f, 0x28, 0x3f, + 0xde, 0x15, 0xbf, 0xb0, 0xfd, 0x3a, 0xda, 0x58, 0x73, 0x25, + 0x05, 0xab, 0x1d, 0xcf, 0xbf, 0xe5, 0x76, 0x30, 0xb5, 0x75, + 0x22, 0x3e, 0x6b, 0xfb, 0x59, 0x30, 0xe3, 0x3a, 0x62, 0x72, + 0xf8, 0xf5, 0x5a, 0xa7, 0x76, 0xbd, 0xef, 0xa4, 0x7b, 0x7e, + 0xdf, 0x4f, 0x94, 0x52, 0xd7, 0x1d, 0x12, 0xb2, 0x00, 0x56, + 0xa8, 0x55, 0x3c, 0x49, 0x7c, 0x48, 0x10, 0x9b, 0x97, 0x9a, + 0xd7, 0xa3, 0xfa, 0x54, 0xbc, 0xf6, 0x53, 0x56, 0x98, 0x7e, + 0xce, 0xab, 0x5b, 0x85, 0xe2, 0x2f, 0x95, 0x8c, 0x36, 0xbd, + 0x78, 0xe8, 0x13, 0xc5, 0x9b, 0x16, 0x72, 0xea, 0xba, 0x13, + 0x36, 0xfb, 0xda, 0xf2, 0x2c, 0x3f, 0x2d, 0x94, 0x1b, 0xf6, + 0x18, 0x83, 0x8d, 0x06, 0x30, 0xaa, 0x59, 0x52, 0x9b, 0xbc, + 0xbe, 0xe1, 0x79, 0x22, 0xb0, 0xe4, 0xd8, 0x37, 0xad, 0xe9, + 0x86, 0xc6, 0x67, 0x73, 0x1c, 0x2b, 0xda, 0x1f, 0xa5, 0x8e, + 0x48, 0x42, 0x52, 0x55, 0x34, 0x59, 0x12, 0xfa, 0x4f, 0xfb, + 0xb6, 0x6f, 0xb7, 0x9d, 0x7d, 0x2c, 0x0d, 0xc3, 0x2c, 0x25, + 0xd5, 0xb3, 0x90, 0xc6, 0x1a, 0x71, 0x3a, 0x8f, 0xa7, 0x5b, + 0xf4, 0xc7, 0xb0, 0x44, 0x98, 0x4a, 0x05, 0xa2, 0x57, 0x92, + 0x6e, 0xc1, 0xb4, 0xf8, 0xba, 0xd8, 0x47, 0xa4, 0x09, 0xe7, + 0xa1, 0x19, 0x67, 0x45, 0x1a, 0x00, 0x5c, 0x3a, 0x18, 0xf3, + 0x29, 0x45, 0x66, 0x3d, 0xfd, 0xc6, 0xa6, 0x67, 0x2f, 0xa5, + 0xc6, 0xcb, 0x6e, 0xe7, 0x01, 0xab, 0xd3, 0x39, 0xa9, 0x1e, + 0x52, 0x64, 0xd6, 0x30, 0x6c, 0x99, 0x4c, 0x24, 0x5e, 0xed, + 0x75, 0xe7, 0xb3, 0x94, 0x60, 0xf4, 0x75, 0xfe, 0x4f, 0xc1, + 0x6b, 0xe5, 0x2c, 0x11, 0xd3, 0x08, 0xa2, 0x3b, 0x86, 0x48, + 0x7a, 0xb6, 0xbf, 0x91, 0x08, 0x12, 0x71, 0x65, 0xa9, 0xbf, + 0x3e, 0xb8, 0xba, 0x9e, 0x07, 0x70, 0x66, 0xdc, 0xab, 0x92, + 0x88, 0x25, 0x3e, 0x71, 0xdb, 0x90, 0x29, 0x5f, 0xad, 0xa3, + 0x51, 0xc2, 0x82, 0x4c, 0xdb, 0x8f, 0xdf, 0x72, 0x1f, 0xd0, + 0x37, 0x75, 0x01, 0xa3, 0x38, 0xed, 0x36, 0x0a, 0x1b, 0xdf, + 0xa0, 0x1a, 0x60, 0x65, 0x54, 0xc7, 0x65, 0xd2, 0xc5, 0xe2, + 0x78, 0xce, 0xb6, 0x51, 0xe4, 0x84, 0xb0, 0x67, 
0x90, 0x85, + 0x80, 0x30, 0xdb, 0x88, 0x17, 0x29, 0x0b, 0x36, 0x5a, 0x40, + 0x85, 0x80, 0xaa, 0x9c, 0xb8, 0x6d, 0x06, 0xc5, 0xd6, 0xcc, + 0xd7, 0x25, 0x83, 0xd1, 0xb2, 0x4d, 0x1e, 0x0d, 0x89, 0x68, + 0x26, 0x07, 0x94, 0x92, 0x97, 0x21, 0xda, 0xa4, 0x7e, 0xc7, + 0x05, 0x10, 0x1a, 0x98, 0x08, 0xae, 0xb4, 0xf3, 0x33, 0x20, + 0xe4, 0xd0, 0x65, 0x94, 0x4c, 0x05, 0xa9, 0xec, 0xec, 0x7f, + 0x76, 0x31, 0xe0, 0x33, 0x33, 0x17, 0x0b, 0x6f, 0xb8, 0x8c, + 0x8e, 0xb3, 0x54, 0x54, 0x39, 0x91, 0xfe, 0x6e, 0x90, 0xf5, + 0x5c, 0x73, 0xbc, 0x2b, 0xb5, 0x1e, 0x7b, 0x2a, 0x7d, 0x83, + 0x80, 0x26, 0x8e, 0x9e, 0x41, 0x5c, 0x30, 0x56, 0x12, 0xf8, + 0xdd, 0xb9, 0xbe, 0xd7, 0x73, 0xa9, 0xba, 0x7f, 0x24, 0xf5, + 0x11, 0xe4, 0xf1, 0x2e, 0xd1, 0x23, 0xc9, 0x4b, 0xa0, 0xfa, + 0x50, 0x49, 0x9c, 0x0a, 0xf5, 0xc0, 0x4d, 0xcf, 0x0a, 0xfd, + 0xa5, 0xcc, 0x2b, 0x4c, 0x62, 0x95, 0x8c, 0xd0, 0x6b, 0x28, + 0x20, 0xf3, 0xdc, 0xec, 0xa4, 0x7b, 0xd6, 0xea, 0xb2, 0x03, + 0xaf, 0x31, 0x68, 0x0f, 0xed, 0xe6, 0x05, 0x67, 0xf2, 0x35, + 0x87, 0xd6, 0xb2, 0xb0, 0x9b, 0x20, 0x1e, 0xf3, 0x1e, 0x0f, + 0x6d, 0x18, 0x91, 0xdd, 0x1f, 0x49, 0xb4, 0x63, 0x8d, 0x92, + 0x01, 0x2d, 0x14, 0x70, 0x81, 0x60, 0x60, 0x85, 0xe5, 0xdf, + 0x84, 0x10, 0x27, 0xfc, 0x83, 0x9e, 0x89, 0x06, 0x79, 0x0c, + 0x38, 0xbe, 0x4d, 0xca, 0xa5, 0xc7, 0x55, 0x82, 0x6c, 0x22, + 0x90, 0x11, 0x6c, 0xc6, 0x74, 0xe7, 0x19, 0x26, 0xb5, 0xc0, + 0x3b, 0x4e, 0x7c, 0x58, 0xf5, 0xd4, 0xce, 0x5e, 0xcd, 0x1b, + 0x8e, 0xb8, 0xe2, 0x17, 0xec, 0xf5, 0xaa, 0x00, 0x58, 0x1d, + 0x6f, 0xa0, 0x2a, 0xf4, 0x59, 0x0a, 0xa2, 0x48, 0xa0, 0x28, + 0xbb, 0x9a, 0x60, 0x84, 0x58, 0x88, 0x0f, 0xdd, 0x2e, 0x59, + 0x6b, 0x0d, 0x10, 0x04, 0x2b, 0x9a, 0x16, 0x5e, 0x2d, 0xed, + 0x87, 0x55, 0x4b, 0xd2, 0x41, 0x4d, 0xfe, 0x1e, 0x7e, 0x44, + 0xa9, 0xbf, 0xdf, 0x61, 0x9f, 0x8c, 0xad, 0x21, 0xfe, 0x03, + 0x84, 0xb5, 0xdc, 0x47, 0x02, 0x33, 0xc5, 0x54, 0xa5, 0xdc, + 0x99, 0x45, 0x3e, 0x20, 0x4d, 0xb3, 0x31, 0x37, 0x64, 0xd7, + 0xab, 0x7b, 0x3e, 0x1b, 0x76, 0xdc, 0x66, 0xa0, 0xa2, 0x9c, + 
0x59, 0xbd, 0xe9, 0xcc, 0xe0, 0xae, 0x2a, 0x78, 0xc0, 0x10, + 0x85, 0x5a, 0x3c, 0x35, 0x75, 0x1e, 0xe6, 0x97, 0x68, 0x62, + 0x10, 0xb4, 0x77, 0xdf, 0xc2, 0x81, 0xe7, 0x53, 0x79, 0xdb, + 0x8e, 0xc3, 0x01, 0x19, 0xc3, 0xbc, 0xd9, 0x2c, 0x17, 0xb2, + 0xa9, 0x65, 0x5c, 0x75, 0xee, 0x1a, 0x9a, 0x65, 0x5b, 0x44, + 0xd5, 0xe2, 0xd2, 0xd5, 0x79, 0x27, 0x53, 0x2b, 0x5a, 0xe5, + 0x61, 0x8f, 0x21, 0x52, 0xb7, 0x7a, 0x27, 0x33, 0x94, 0x74, + 0x61, 0x0b, 0x82, 0x72, 0xc6, 0x76, 0xd7, 0x67, 0x1b, 0x18, + 0x88, 0x2a, 0x67, 0xf3, 0xea, 0x45, 0x8a, 0x38, 0x82, 0x61, + 0xbb, 0x02, 0x5b, 0x1f, 0x93, 0xa4, 0xa4, 0x55, 0x3c, 0x83, + 0x67, 0x43, 0xb2, 0x0f, 0xb6, 0x9c, 0x88, 0x9b, 0x4b, 0xa8, + 0x3c, 0xdc, 0xa1, 0x88, 0x08, 0xf7, 0x16, 0x29, 0xef, 0x36, + 0xab, 0x69, 0x09, 0x08, 0xa9, 0x4c, 0xb0, 0x0b, 0x21, 0xc9, + 0xc6, 0x12, 0xf3, 0xfa, 0x2a, 0x92, 0x17, 0xc3, 0xf4, 0xe7, + 0x71, 0x1e, 0xab, 0x4b, 0x78, 0xdd, 0x65, 0xa8, 0xb7, 0x63, + 0xf8, 0xe4, 0x82, 0x5e, 0x21, 0xa9, 0xc8, 0x87, 0x0f, 0x15, + 0xf3, 0xf2, 0x8e, 0x44, 0xf3, 0x73, 0x21, 0xfe, 0x52, 0x43, + 0x2b, 0xe7, 0xae, 0x14, 0x50, 0xaa, 0x3d, 0x07, 0x91, 0x1d, + 0xc1, 0x37, 0xcd, 0xd1, 0x45, 0x36, 0xa1, 0x68, 0xe9, 0x67, + 0x5c, 0x38, 0x9e, 0xcb, 0xf2, 0x09, 0x5c, 0xf8, 0x86, 0xfe, + 0x46, 0x53, 0xcc, 0x96, 0xbb, 0xd6, 0xca, 0xc1, 0x8d, 0x55, + 0xe2, 0xc6, 0x26, 0xba, 0x4f, 0xab, 0x65, 0x0a, 0x65, 0x55, + 0xf1, 0x0d, 0x6c, 0xbe, 0x31, 0x05, 0xab, 0xa1, 0xec, 0x1d, + 0xc9, 0x76, 0xf6, 0x26, 0xc7, 0x15, 0x8a, 0xf9, 0x8c, 0x23, + 0x7a, 0x0b, 0x66, 0xc5, 0xb5, 0x2a, 0xf2, 0x16, 0x84, 0xef, + 0x44, 0x94, 0x4b, 0xcb, 0x18, 0x96, 0x05, 0x44, 0xb8, 0xe3, + 0xa3, 0xe5, 0x5a, 0x98, 0x3b, 0xd3, 0x3f, 0x4a, 0x80, 0xbc, + 0x12, 0x51, 0x92, 0xc6, 0x1d, 0xeb, 0x44, 0x1d, 0x6e, 0x80, + 0xfe, 0x3c, 0x7f, 0x89, 0x6d, 0xaf, 0xc3, 0x71, 0xdb, 0xcf, + 0xc5, 0xa9, 0x81, 0x87, 0xec, 0xc4, 0x61, 0xf4, 0x3e, 0x16, + 0x25, 0x34, 0x0e, 0x15, 0x07, 0xc8, 0xe1, 0x74, 0xc4, 0x7c, + 0x8c, 0xfe, 0xa8, 0x1b, 0xd5, 0xad, 0xbb, 0x46, 0x72, 0x6b, + 0x46, 0x04, 
0x90, 0xed, 0x5f, 0x64, 0xfd, 0x23, 0x35, 0xe0, + 0x23, 0xda, 0xa1, 0x76, 0xb0, 0x73, 0xb3, 0x7b, 0xf7, 0xa3, + 0x6a, 0xd2, 0xb8, 0x2f, 0xc2, 0x6b, 0xe3, 0x01, 0xd2, 0x28, + 0xcd, 0xf4, 0xd2, 0xc2, 0x6c, 0x75, 0xec, 0xc1, 0x76, 0x8f, + 0xd4, 0x4c, 0xc0, 0xe0, 0xd6, 0x33, 0x45, 0xbb, 0x21, 0xe7, + 0x1f, 0xf9, 0x9e, 0x66, 0x18, 0x52, 0xfa, 0x35, 0x7b, 0x33, + 0x75, 0x6d, 0x3b, 0x39, 0xf8, 0xce, 0x75, 0x22, 0x57, 0x65, + 0x6a, 0x34, 0x1e, 0xe9, 0x99, 0x66, 0x78, 0xa1, 0xbc, 0xfd, + 0xb7, 0xab, 0x98, 0xa7, 0x4b, 0x62, 0x45, 0x10, 0xc4, 0xf4, + 0xb0, 0xc2, 0xf1, 0x6d, 0xd3, 0x31, 0x19, 0x8b, 0xc4, 0xa3, + 0x36, 0x7e, 0xf4, 0x7f, 0xf4, 0x72, 0xb3, 0xae, 0x75, 0xdb, + 0x5b, 0xf6, 0x5d, 0xe7, 0xf3, 0x23, 0x90, 0xf3, 0x0d, 0x5c, + 0x0d, 0x54, 0x62, 0x51, 0xbc, 0x20, 0xf2, 0x45, 0x10, 0xb6, + 0x3c, 0x93, 0xb6, 0x01, 0x6c, 0x7e, 0x45, 0xc1, 0xad, 0xe2, + 0xe1, 0x1c, 0x15, 0x37, 0x30, 0x1f, 0x90, 0x62, 0x6e, 0x6d, + 0x6a, 0x83, 0xfd, 0xca, 0xd7, 0x76, 0x2b, 0xb6, 0xc5, 0xf9, + 0x1d, 0x54, 0x28, 0xd8, 0xcf, 0xa8, 0xac, 0xf8, 0xc1, 0x54, + 0xf8, 0x5c, 0x23, 0x48, 0x6e, 0xb8, 0x7a, 0x02, 0x72, 0x30, + 0x0e, 0x11, 0x83, 0xb5, 0x2d, 0x83, 0xad, 0xc3, 0x3e, 0xf6, + 0x8f, 0xf3, 0xb6, 0xa0, 0xd7, 0xbb, 0x43, 0xc1, 0x75, 0xfe, + 0x9e, 0x1d, 0x69, 0x2e, 0x83, 0x03, 0x43, 0x43, 0xf6, 0xfa, + 0x43, 0x54, 0xd3, 0xab, 0xfc, 0x53, 0xde, 0x87, 0x92, 0x7b, + 0xc4, 0x3a, 0x5a, 0xe4, 0xee, 0xef, 0x18, 0x96, 0x9c, 0xe0, + 0xea, 0xd1, 0xce, 0x3a, 0x67, 0x56, 0x0c, 0xe8, 0x23, 0x0d, + 0x34, 0x1e, 0x0d, 0x80, 0x03, 0x8f, 0x90, 0xc5, 0xba, 0x01, + 0x43, 0xfb, 0x94, 0x99, 0x3b, 0x19, 0x13, 0x30, 0x5f, 0xd8, + 0x0b, 0xb1, 0x10, 0x8d, 0x61, 0x20, 0x99, 0x3b, 0x2d, 0xbe, + 0x64, 0xf7, 0x7a, 0x97, 0x84, 0x12, 0xc0, 0x44, 0x85, 0x0b, + 0x4b, 0x2c, 0x1c, 0x83, 0x7f, 0x6a, 0xd7, 0x71, 0x6b, 0x5b, + 0x71, 0xf0, 0xc2, 0xbb, 0x9a, 0xfb, 0xad, 0x5c, 0x27, 0x07, + 0x02, 0x5f, 0x54, 0xd7, 0xb5, 0x5f, 0x95, 0xd1, 0x15, 0xb0, + 0x27, 0x1d, 0xda, 0x3f, 0x87, 0x5d, 0x8b, 0x20, 0x2c, 0x12, + 0x6b, 0x11, 0x67, 0xcb, 
0xd7, 0xa7, 0xc5, 0xa1, 0x55, 0x91, + 0xed, 0x2a, 0x11, 0xc4, 0x84, 0x1c, 0xab, 0xeb, 0x68, 0x84, + 0x18, 0x94, 0x5b, 0xb1, 0x39, 0x65, 0xca, 0x6c, 0x14, 0xe1, + 0x65, 0xec, 0x18, 0xea, 0x7c, 0x5c, 0x7f, 0x4f, 0xda, 0xe9, + 0x4d, 0x30, 0xaf, 0x96, 0x47, 0x01, 0x7a, 0xfa, 0xd2, 0x97, + 0x37, 0x73, 0xa6, 0xd5, 0x4b, 0x36, 0xb4, 0xda, 0xa7, 0x69, + 0x43, 0x38, 0xa6, 0x59, 0x20, 0x4b, 0xd4, 0x1c, 0x2e, 0xbb, + 0x18, 0xed, 0xd3, 0x3e, 0xef, 0xd2, 0xac, 0xb1, 0xac, 0x51, + 0x23, 0x8b, 0xbc, 0xc3, 0x01, 0x99, 0xd4, 0x2c, 0x6c, 0x33, + 0xf5, 0xe4, 0x3d, 0x3e, 0x6d, 0xae, 0x05, 0x62, 0xeb, 0xb3, + 0xb6, 0x66, 0x64, 0x42, 0x32, 0xba, 0x81, 0xd0, 0xa1, 0x03, + 0xef, 0xc9, 0x4b, 0xee, 0x22, 0x0a, 0x64, 0x08, 0x8a, 0x99, + 0xf1, 0xc0, 0x4e, 0x9e, 0x79, 0x4a, 0xa0, 0x9b, 0xb1, 0xba, + 0x84, 0xbd, 0xe9, 0x5b, 0xec, 0x24, 0x4b, 0x34, 0x5d, 0x2a, + 0xcc, 0xd2, 0x5f, 0xc7, 0x58, 0xe6, 0x47, 0xbb, 0xb9, 0xf7, + 0xb4, 0xc0, 0x1d, 0x2b, 0x21, 0xc0, 0x7d, 0x76, 0xd5, 0xc2, + 0x40, 0xa5, 0xe4, 0x9b, 0x13, 0x0d, 0xb9, 0xb7, 0xb3, 0x2f, + 0xcd, 0xc4, 0xef, 0xef, 0x48, 0x28, 0xe7, 0x87, 0x11, 0x46, + 0xed, 0xac, 0xbf, 0xec, 0x69, 0x30, 0xa7, 0x61, 0x92, 0xcf, + 0x5b, 0xa2, 0x69, 0xe0, 0x14, 0xa2, 0xf9, 0x29, 0xf8, 0xc5, + 0x7e, 0xe2, 0xf0, 0xfc, 0xf0, 0x32, 0xf3, 0x0c, 0xab, 0x7d, + 0xbb, 0xbd, 0x89, 0x54, 0x98, 0x47, 0x6a, 0xb2, 0x87, 0x45, + 0xd6, 0x06, 0xd4, 0xbf, 0x77, 0x95, 0xf4, 0x61, 0x38, 0x96, + 0xad, 0x56, 0xfa, 0x71, 0x9b, 0xde, 0x15, 0x0f, 0xc3, 0x3d, + 0xf5, 0x21, 0x63, 0x9b, 0xfe, 0x70, 0x2d, 0xfe, 0xc2, 0xcf, + 0x26, 0xfd, 0xd6, 0xb6, 0x3e, 0xcc, 0x05, 0x36, 0x11, 0xe2, + 0xb7, 0xdd, 0xc4, 0x6f, 0xc7, 0xac, 0xfe, 0x6f, 0x5b, 0xb4, + 0xe6, 0x47, 0x53, 0x30, 0x69, 0x56, 0x5b, 0x5c, 0xee, 0xc4, + 0xd0, 0x5b, 0x1e, 0x1b, 0xb8, 0xbe, 0xcd, 0xde, 0x28, 0xe6, + 0x90, 0x2d, 0xcf, 0xa0, 0xfb, 0x11, 0x3b, 0xda, 0xba, 0x2e, + 0x05, 0xa8, 0x9e, 0x3b, 0x07, 0xce, 0x87, 0xf0, 0xc4, 0xb4, + 0x2c, 0x33, 0x32, 0xda, 0x5a, 0x86, 0xba, 0x97, 0x1a, 0xf4, + 0xbf, 0xbf, 0x7c, 0x0c, 0xa8, 0xf1, 
0x17, 0x99, 0x59, 0xa5, + 0x32, 0xc2, 0x48, 0x2d, 0xa6, 0x02, 0x18, 0xc1, 0x97, 0x95, + 0x7b, 0x6f, 0x62, 0xf8, 0xbb, 0x80, 0x83, 0xb9, 0xe3, 0x27, + 0xdd, 0x38, 0xb0, 0x26, 0xea, 0xde, 0xa5, 0x45, 0xf3, 0x05, + 0xde, 0x51, 0x36, 0x97, 0xc2, 0x80, 0x13, 0x9d, 0xd8, 0x58, + 0xd8, 0xc3, 0xac, 0xaf, 0xfd, 0xf5, 0x19, 0x21, 0x26, 0x9d, + 0xe8, 0xeb, 0xc5, 0x94, 0xd2, 0xb2, 0x56, 0x7e, 0xe6, 0xab, + 0x3c, 0x9d, 0x17, 0xaa, 0x8a, 0x15, 0x6d, 0xd0, 0x01, 0x20, + 0x01, 0xe0, 0x76, 0x44, 0x9c, 0xd0, 0x36, 0x56, 0xbf, 0xc2, + 0x3d, 0xee, 0x5a, 0x11, 0x22, 0xbf, 0x05, 0xf5, 0x98, 0xfe, + 0x88, 0x86, 0x9f, 0x5b, 0x55, 0xce, 0x8b, 0x65, 0x49, 0x24, + 0x13, 0x30, 0x44, 0xb4, 0x32, 0x69, 0x23, 0x13, 0xab, 0xb9, + 0x76, 0x7c, 0x7d, 0xb1, 0x14, 0xdf, 0x35, 0x72, 0x82, 0xbd, + 0xda, 0x6a, 0xa2, 0x91, 0x5c, 0x9d, 0x2a, 0xe0, 0x89, 0x4d, + 0x89, 0xa7, 0xf4, 0x16, 0x7e, 0xc0, 0x74, 0x7e, 0x7d, 0xcc, + 0x2f, 0xf5, 0x63, 0x1a, 0xcb, 0x03, 0x18, 0x17, 0x2a, 0x21, + 0x10, 0x47, 0x83, 0x77, 0x71, 0x2a, 0xdb, 0x82, 0x09, 0xb8, + 0x0e, 0x5c, 0x2d, 0x9d, 0x56, 0x12, 0x20, 0x3d, 0x7c, 0xea, + 0xb1, 0x2f, 0x2a, 0x65, 0xc3, 0xb3, 0xc1, 0xc8, 0x03, 0x74, + 0xe4, 0xe9, 0x24, 0x3a, 0x31, 0xcf, 0x5b, 0x66, 0x3d, 0x23, + 0x1c, 0x39, 0x3b, 0x8b, 0x66, 0x46, 0x0b, 0xcc, 0x57, 0xe7, + 0xb5, 0xe7, 0x35, 0x7c, 0xd0, 0xe2, 0x94, 0x22, 0x33, 0x2a, + 0xb4, 0x01, 0x70, 0x1f, 0x03, 0x8c, 0xde, 0x3b, 0x70, 0x13, + 0x11, 0x1b, 0xe4, 0x22, 0xf3, 0x2b, 0x1f, 0x9d, 0x76, 0x08, + 0xec, 0x7c, 0x88, 0x19, 0xc0, 0x73, 0xef, 0x23, 0x94, 0xda, + 0xa5, 0xa2, 0x2d, 0xf4, 0xd9, 0xc5, 0x60, 0x87, 0x23, 0xb7, + 0x35, 0x29, 0x37, 0x04, 0xae, 0x7f, 0x7c, 0x81, 0x78, 0x49, + 0x30, 0xfc, 0x27, 0x98, 0x83, 0xa9, 0xf2, 0xcc, 0xe3, 0x45, + 0xa1, 0xe2, 0xe9, 0xf2, 0x36, 0x3b, 0x43, 0x8d, 0xe9, 0x39, + 0xa6, 0x2f, 0x00, 0xed, 0xf4, 0x88, 0x61, 0xd7, 0xfc, 0xad, + 0x5a, 0x94, 0x8f, 0x84, 0xc3, 0x38, 0xa0, 0x4e, 0xc3, 0x0d, + 0x30, 0xe1, 0x39, 0x36, 0xc8, 0x86, 0x41, 0xb8, 0x72, 0xd8, + 0x48, 0x4a, 0x08, 0x56, 0xa1, 0xbc, 0x52, 0x94, 
0xee, 0x36, + 0x7f, 0xc4, 0xf3, 0x73, 0x59, 0x12, 0x0a, 0x03, 0x4c, 0xbe, + 0x10, 0x96, 0xdd, 0x37, 0xdd, 0x48, 0x41, 0xab, 0xcd, 0xf7, + 0x99, 0xc8, 0xac, 0x23, 0x23, 0x56, 0xba, 0x8b, 0x24, 0x0d, + 0x18, 0x15, 0x6d, 0x5b, 0x88, 0x01, 0x24, 0xe9, 0x99, 0x27, + 0x56, 0x3d, 0xdc, 0xe4, 0xf2, 0x41, 0x2b, 0x65, 0x1f, 0xe0, + 0x84, 0x2f, 0x31, 0xbd, 0x27, 0xd8, 0x0e, 0x99, 0xea, 0xc1, + 0xa9, 0x0b, 0x21, 0x2a, 0xe7, 0x57, 0xc0, 0xa3, 0x79, 0x1f, + 0xf7, 0xbb, 0x4e, 0x50, 0xf2, 0x37, 0x53, 0xed, 0x73, 0xb2, + 0xde, 0x08, 0xef, 0xa5, 0x9e, 0xcd, 0x38, 0x51, 0x55, 0xa1, + 0xf6, 0xac, 0x78, 0xf5, 0xd6, 0xd9, 0x7d, 0x92, 0x34, 0x57, + 0x78, 0xd1, 0x47, 0x17, 0xde, 0x10, 0x7e, 0xce, 0xf8, 0x1c, + 0x09, 0xcc, 0xe0, 0xea, 0x6d, 0x55, 0xec, 0xcb, 0x49, 0xaa, + 0x22, 0x5a, 0x18, 0xe7, 0x10, 0xb2, 0xb5, 0xf0, 0x9d, 0xdf, + 0xc5, 0x49, 0xe8, 0x4d, 0x9d, 0xcd, 0x51, 0x53, 0xf5, 0x95, + 0x37, 0x4b, 0xe4, 0x57, 0xca, 0x3f, 0x55, 0x68, 0x75, 0xc1, + 0xcb, 0xd6, 0x5b, 0xb7, 0x4b, 0x62, 0xb5, 0xce, 0x74, 0x98, + 0x84, 0x01, 0x28, 0xc8, 0x1c, 0x21, 0xc2, 0xc5, 0xa8, 0xdf, + 0x69, 0xd8, 0xd5, 0xe2, 0x4d, 0x72, 0xc1, 0xd5, 0x2f, 0x13, + 0x94, 0x6d, 0x1c, 0xc6, 0x46, 0xdf, 0x70, 0x57, 0xdd, 0x9b, + 0xb8, 0x90, 0x37, 0xad, 0xe9, 0xf0, 0x09, 0x1f, 0xfb, 0xb1, + 0x99, 0x6e, 0xa5, 0xe7, 0x9f, 0x94, 0xf1, 0xd4, 0xbd, 0x85, + 0x13, 0xfd, 0xbf, 0x18, 0x86, 0xc2, 0xb7, 0x95, 0xa9, 0x4c, + 0x26, 0x2f, 0x54, 0xcf, 0xc1, 0x64, 0xf6, 0x85, 0x13, 0xc9, + 0xd7, 0xd2, 0xef, 0xd7, 0xf0, 0x38, 0xf9, 0x19, 0x10, 0x88, + 0xd6, 0x13, 0x93, 0xc5, 0xb4, 0xc3, 0x58, 0x2e, 0x38, 0xbf, + 0xa9, 0xf7, 0x7d, 0xbc, 0x6d, 0xb6, 0x08, 0x3e, 0xd4, 0xab, + 0x7b, 0xd1, 0xe0, 0x68, 0x5c, 0xc7, 0xa8, 0x6a, 0xab, 0x9c, + 0x86, 0xe9, 0x0a, 0x7d, 0x71, 0x92, 0xdb, 0xd3, 0xa3, 0x8f, + 0x0e, 0x5c, 0xad, 0x86, 0xac, 0x5f, 0x9d, 0xd6, 0x75, 0x3b, + 0xf3, 0x64, 0x61, 0x27, 0xa5, 0x10, 0x74, 0x3f, 0x30, 0xce, + 0xa3, 0x40, 0x22, 0xb4, 0x29, 0xa3, 0x9b, 0x94, 0x10, 0xe5, + 0xc0, 0x8f, 0x7f, 0xe3, 0x51, 0x13, 0x93, 0x2f, 0xb2, 0xba, + 
0x4e, 0xb8, 0x6e, 0xc2, 0x43, 0x25, 0xb6, 0xb1, 0x50, 0xe2, + 0xe6, 0x8a, 0xbe, 0x1c, 0xc6, 0x0a, 0x49, 0xcd, 0x87, 0xa4, + 0x36, 0x8b, 0xcf, 0x8c, 0xa1, 0xf3, 0x69, 0x3a, 0x73, 0x04, + 0x9d, 0xde, 0xf1, 0x8e, 0x7d, 0x62, 0x43, 0xf5, 0x38, 0x4b, + 0x81, 0x1f, 0xb7, 0xac, 0x06, 0x29, 0x08, 0xd3, 0x06, 0xd0, + 0x67, 0x7f, 0xf4, 0x25, 0x53, 0x54, 0x20, 0x2e, 0xdf, 0x6b, + 0x33, 0x03, 0x64, 0x04, 0x36, 0xd0, 0x25, 0x02, 0x2a, 0x6e, + 0xa6, 0xfc, 0xd5, 0x36, 0x6b, 0x55, 0x6c, 0x9f, 0x43, 0x34, + 0x29, 0xc7, 0xe9, 0xe2, 0xe4, 0x9a, 0x28, 0xbb, 0x40, 0x8d, + 0x4d, 0x99, 0x13, 0x84, 0x71, 0x31, 0x6d, 0xaa, 0x99, 0x34, + 0xb0, 0x9a, 0x7d, 0x16, 0xe6, 0xdf, 0x83, 0xaf, 0xbd, 0xf8, + 0x77, 0x88, 0xb1, 0x4b, 0x18, 0xa8, 0x62, 0xe2, 0xb5, 0xdb, + 0x17, 0x31, 0x47, 0xeb, 0xca, 0x9e, 0x9e, 0xe7, 0x19, 0x99, + 0x09, 0x00, 0x9a, 0x40, 0x55, 0x38, 0xc4, 0xea, 0xd7, 0xbf, + 0xae, 0x00, 0x95, 0x8b, 0x78, 0x01, 0x60, 0x01, 0xf9, 0xa3, + 0xac, 0x49, 0xc8, 0xdd, 0xc5, 0x78, 0xc2, 0x69, 0xf2, 0x2c, + 0x2b, 0x7a, 0x2f, 0x01, 0x09, 0x13, 0xe9, 0x85, 0x3e, 0xb9, + 0x63, 0xd9, 0x48, 0xea, 0xd1, 0x9e, 0x21, 0xda, 0x4b, 0x26, + 0xc7, 0xc9, 0xfd, 0xb5, 0x42, 0x37, 0x24, 0x1a, 0x97, 0x51, + 0x8c, 0x20, 0x51, 0xfe, 0x0d, 0x14, 0x0c, 0xc8, 0x50, 0x74, + 0xb5, 0x1d, 0xba, 0x8c, 0xd5, 0x8a, 0x80, 0xf4, 0x1e, 0x37, + 0xf2, 0x6a, 0x3c, 0x31, 0xf3, 0x90, 0xb4, 0x0b, 0xab, 0x7f, + 0x44, 0x2c, 0xe1, 0x09, 0x25, 0x42, 0xf9, 0x81, 0xd4, 0x49, + 0x4c, 0x5b, 0xd4, 0xc1, 0x14, 0x5e, 0x5e, 0x3c, 0xdc, 0xe8, + 0x7d, 0xee, 0x39, 0xe6, 0x10, 0x78, 0x7e, 0xf6, 0xb5, 0xb2, + 0xf1, 0x32, 0xac, 0xa2, 0x48, 0xc4, 0xc6, 0xbe, 0x0d, 0xeb, + 0x8a, 0x53, 0xf9, 0x0a, 0x6f, 0x50, 0x71, 0x36, 0x10, 0x88, + 0xc5, 0x8c, 0xda, 0xb9, 0x99, 0xe3, 0x56, 0xb6, 0xad, 0x59, + 0x1e, 0x42, 0x72, 0x97, 0x53, 0x75, 0x2f, 0x4e, 0x08, 0x43, + 0x53, 0xb1, 0x02, 0xbc, 0xd7, 0xc9, 0x13, 0x7a, 0x05, 0x84, + 0x8c, 0x0a, 0xf3, 0x62, 0x11, 0x5d, 0x0d, 0x56, 0x2a, 0x5a, + 0x7e, 0x05, 0x62, 0x57, 0xe5, 0x05, 0xb7, 0x5a, 0xf7, 0x69, + 0x39, 0xd7, 
0xf2, 0x0d, 0x93, 0x82, 0x0c, 0x9a, 0xa4, 0xc8, + 0x77, 0x6c, 0xe3, 0x60, 0x21, 0x6c, 0x1b, 0x5e, 0xe2, 0xf1, + 0xb6, 0xbb, 0x69, 0x21, 0xca, 0xf6, 0xf0, 0x5f, 0xb4, 0xdb, + 0xeb, 0x20, 0x47, 0xa9, 0x82, 0x11, 0x0d, 0x72, 0xde, 0xd1, + 0xa4, 0xda, 0x6c, 0xfa, 0xaa, 0xe9, 0x2b, 0x30, 0xab, 0x6a, + 0x83, 0x48, 0xea, 0xfd, 0xf9, 0xaa, 0x76, 0x93, 0xe6, 0xe2, + 0x25, 0xb0, 0x6d, 0x4f, 0x11, 0xd4, 0xee, 0x9e, 0x6d, 0x72, + 0x20, 0x00, 0x8e, 0x90, 0x62, 0x0a, 0x57, 0x57, 0x08, 0x1f, + 0xfd, 0x24, 0x64, 0x69, 0x6e, 0x8e, 0x76, 0x8c, 0xbb, 0x8d, + 0x42, 0x8d, 0xd9, 0x94, 0xbf, 0xf2, 0x20, 0x6e, 0xcd, 0xca, + 0xdd, 0x0a, 0x4b, 0x5c, 0xb7, 0xa7, 0x36, 0x8f, 0xea, 0x51, + 0x2c, 0x3f, 0x59, 0x66, 0xa1, 0x8a, 0x8b, 0xba, 0x4c, 0x6c, + 0xc3, 0x63, 0x5b, 0x6d, 0x29, 0x2f, 0xc2, 0xd1, 0x99, 0x34, + 0x23, 0xf0, 0x81, 0x5c, 0xf7, 0x6e, 0xea, 0x84, 0x65, 0x9c, + 0x7f, 0x88, 0x1e, 0x8c, 0xf9, 0xaf, 0x57, 0x52, 0x58, 0x4a, + 0xd9, 0x3b, 0x6a, 0xf1, 0x02, 0xa6, 0xe0, 0xfe, 0x9b, 0xdf, + 0x7e, 0xa1, 0x74, 0x96, 0x37, 0x61, 0x95, 0x46, 0x81, 0xb2, + 0x37, 0x50, 0x47, 0x57, 0xf3, 0xc7, 0x07, 0xb1, 0x54, 0xe0, + 0x39, 0x0c, 0x48, 0xd7, 0x59, 0x17, 0x37, 0x0c, 0x08, 0x1b, + 0x84, 0xc7, 0xc1, 0xad, 0xdc, 0x3a, 0xd8, 0x0f, 0x77, 0x71, + 0x25, 0x92, 0x23, 0x1f, 0x51, 0xe6, 0x8d, 0x94, 0x8e, 0xc8, + 0xbe, 0x31, 0xbf, 0xce, 0x30, 0xc9, 0x3e, 0xe0, 0x65, 0x71, + 0xa2, 0x50, 0xb7, 0xab, 0x1a, 0x81, 0x3c, 0x8e, 0x3c, 0xe4, + 0xc3, 0xaa, 0x16, 0xe4, 0x0b, 0x41, 0xdd, 0x7f, 0x76, 0x06, + 0xad, 0x82, 0x19, 0x6e, 0x37, 0x9d, 0xc4, 0x3c, 0x26, 0x3a, + 0xa1, 0x13, 0x93, 0xbc, 0x9b, 0x6a, 0xd1, 0xb1, 0x03, 0x75, + 0xbd, 0x3a, 0x9d, 0x6c, 0x0f, 0xe3, 0x78, 0xf6, 0x12, 0x73, + 0x29, 0xef, 0x10, 0xdf, 0xfb, 0xfb, 0x58, 0xd6, 0xd8, 0x14, + 0xd7, 0xa2, 0xdb, 0x5f, 0x71, 0x49, 0x98, 0x5d, 0x3f, 0xb5, + 0xed, 0xc3, 0x1e, 0x77, 0xec, 0x3f, 0x5e, 0x99, 0x15, 0x41, + 0xc5, 0xb0, 0x82, 0xe5, 0x1c, 0x81, 0x85, 0x9e, 0xf1, 0x5a, + 0x58, 0x80, 0xb2, 0x27, 0x2b, 0x19, 0xb7, 0xf4, 0x29, 0x79, + 0x01, 0x41, 0x3e, 0x9c, 
0x7a, 0x76, 0x49, 0x53, 0xaa, 0x8c, + 0xd1, 0x2f, 0xc5, 0x72, 0xdb, 0x2a, 0x4b, 0x2b, 0x9e, 0x76, + 0x20, 0x08, 0xfd, 0x3e, 0xb9, 0x97, 0xda, 0x6c, 0xdb, 0xf1, + 0x48, 0x6a, 0x3e, 0xbd, 0x8a, 0x15, 0x22, 0xbc, 0x13, 0x67, + 0xe1, 0xd4, 0x1a, 0xa3, 0xc1, 0xdb, 0xa4, 0x54, 0x16, 0x05, + 0x79, 0x13, 0x2d, 0x8e, 0x7c, 0x85, 0xc9, 0x4e, 0x4b, 0x00, + 0xc7, 0xc5, 0x7f, 0x47, 0xf0, 0xee, 0x3a, 0x36, 0x8f, 0x96, + 0xfe, 0x2a, 0x77, 0x48, 0x45, 0x3b, 0x52, 0x8d, 0x6d, 0x21, + 0x3f, 0x40, 0x52, 0x22, 0x83, 0x26, 0x06, 0xd3, 0x81, 0x53, + 0x18, 0x4f, 0xeb, 0xc8, 0x20, 0x1f, 0xc8, 0xab, 0x7b, 0x40, + 0x88, 0x9b, 0x6e, 0x5b, 0x96, 0x26, 0x64, 0xbf, 0x35, 0x33, + 0x06, 0x16, 0x7a, 0xf5, 0x43, 0xd1, 0xa7, 0xb9, 0x4c, 0x11, + 0x91, 0x6a, 0x52, 0x21, 0xa0, 0x1a, 0x23, 0x08, 0x64, 0x51, + 0x43, 0x3a, 0xb9, 0xf2, 0xa6, 0xa2, 0xc8, 0x46, 0xf1, 0xd0, + 0x8f, 0x0d, 0xe0, 0xd0, 0x1a, 0xf8, 0xcd, 0xcf, 0xaa, 0xaa, + 0x81, 0xa5, 0x69, 0xab, 0x53, 0xd6, 0xfb, 0x63, 0xe0, 0x9e, + 0xc6, 0x27, 0xc6, 0xb1, 0xf6, 0x49, 0x96, 0xea, 0x80, 0x19, + 0xf9, 0xe4, 0xb6, 0xec, 0x6b, 0x77, 0x14, 0xee, 0x93, 0xa3, + 0xa1, 0xd9, 0xac, 0x83, 0x63, 0xa1, 0x31, 0x1d, 0x5e, 0x8b, + 0x86, 0x1f, 0xa2, 0xc0, 0x97, 0x3a, 0x4a, 0xeb, 0xc3, 0xbb, + 0x1a, 0x30, 0xd3, 0xcd, 0x75, 0x8c, 0x93, 0x66, 0x3b, 0xdb, + 0x93, 0xfa, 0xf1, 0x62, 0xeb, 0xd7, 0x81, 0xb5, 0xe4, 0xba, + 0xdc, 0x7a, 0x11, 0x4c, 0x79, 0x6c, 0xe5, 0xdd, 0x69, 0x71, + 0x77, 0xf0, 0x4f, 0x90, 0x4b, 0x00, 0xa5, 0xd3, 0xe7, 0xba, + 0xad, 0x32, 0x5b, 0x34, 0x07, 0x0a, 0x70, 0x1f, 0xe1, 0xd8, + 0xb0, 0x9b, 0xc2, 0x35, 0x9c, 0x8c, 0x2b, 0xe2, 0xdd, 0x31, + 0x48, 0xcd, 0xf1, 0x8b, 0x18, 0xfa, 0x21, 0x9a, 0x2a, 0x3e, + 0xcd, 0xeb, 0x5e, 0x06, 0xfc, 0xf6, 0x9e, 0x41, 0x5f, 0xc8, + 0x7b, 0x13, 0xaf, 0x91, 0xac, 0xe6, 0xe9, 0xcd, 0xc1, 0x7c, + 0x84, 0x81, 0xaf, 0x75, 0x0a, 0xe9, 0x30, 0x6d, 0xfa, 0x02, + 0x2a, 0x1f, 0x13, 0x2e, 0x2d, 0x34, 0x7b, 0x87, 0x6c, 0x5a, + 0x43, 0xaa, 0x7d, 0x25, 0xd0, 0xc3, 0xcf, 0x32, 0x8b, 0x3f, + 0x28, 0xb5, 0x0b, 0xa8, 0x7d, 0x06, 
0x8e, 0x04, 0x39, 0xd0, + 0x96, 0x6a, 0xa2, 0x26, 0x64, 0xe1, 0xdc, 0xcb, 0xcc, 0xd5, + 0xfe, 0x33, 0xef, 0x9c, 0xba, 0xdb, 0xb8, 0xe1, 0xf2, 0x2a, + 0xb8, 0x5c, 0x50, 0xb8, 0x2a, 0xc8, 0x82, 0xc0, 0x1c, 0xc9, + 0x9b, 0xca, 0x53, 0x0a, 0x52, 0x3d, 0xbe, 0x23, 0x71, 0xdc, + 0x5c, 0xf8, 0x41, 0x6e, 0xb1, 0x00, 0x17, 0x96, 0xec, 0xdc, + 0xfb, 0xac, 0x61, 0xb4, 0x62, 0xcd, 0x97, 0x5e, 0xc9, 0x82, + 0xf8, 0x00, 0xc7, 0x50, 0x2c, 0xf5, 0xe5, 0x6b, 0xd8, 0xe1, + 0x2c, 0x65, 0x50, 0x46, 0xbd, 0x1e, 0x4f, 0xe3, 0x5c, 0x3b, + 0x90, 0x22, 0xf4, 0x24, 0x4f, 0x4f, 0x8f, 0x08, 0x3a, 0x8e, + 0x02, 0x9c, 0x99, 0xf9, 0xe4, 0x0b, 0x34, 0x37, 0x66, 0x0b, + 0x29, 0x36, 0xa6, 0xde, 0x8a, 0x9d, 0x21, 0x2b, 0xab, 0x6a, + 0x0f, 0xe3, 0x55, 0x2c, 0xb9, 0xa7, 0x35, 0x68, 0x5e, 0xf4, + 0xaf, 0x38, 0x66, 0x1d, 0x2a, 0xcc, 0xe9, 0x26, 0x7a, 0x72, + 0xb4, 0x16, 0xde, 0x98, 0x4a, 0xe0, 0x01, 0xa4, 0x36, 0x11, + 0x00, 0x26, 0xc6, 0x08, 0xeb, 0x7e, 0xfd, 0x2f, 0x1a, 0xe7, + 0x8d, 0x17, 0xe2, 0xb4, 0xf3, 0x41, 0x53, 0xc5, 0x56, 0x64, + 0x92, 0x1f, 0xcf, 0x7f, 0xf2, 0xf6, 0x2c, 0x06, 0x10, 0x5e, + 0x21, 0xd1, 0xc6, 0x22, 0x66, 0x66, 0x5b, 0x70, 0x97, 0xbc, + 0xbf, 0x9f, 0x16, 0x92, 0x95, 0x66, 0xfb, 0xe3, 0x8b, 0xf6, + 0x36, 0xb7, 0x53, 0x8a, 0xd3, 0xbe, 0xa3, 0x3b, 0xa4, 0xc3, + 0x10, 0xc0, 0x7c, 0x28, 0xea, 0x7c, 0xa7, 0x35, 0x79, 0x0a, + 0x67, 0xe4, 0x47, 0xa0, 0x13, 0xea, 0xe8, 0x2e, 0x36, 0xbe, + 0xae, 0x83, 0x5f, 0xdc, 0xad, 0x35, 0x7b, 0xcd, 0x6d, 0x44, + 0x35, 0x40, 0xd7, 0xbf, 0x3a, 0xd0, 0xb9, 0x3d, 0xdc, 0xed, + 0x14, 0x4b, 0xcd, 0x34, 0x2c, 0x06, 0xd9, 0x61, 0x18, 0x80, + 0xe1, 0x06, 0x41, 0xb1, 0x9e, 0x46, 0x44, 0xa5, 0xb0, 0xaf, + 0xc9, 0x26, 0xa0, 0x7a, 0xca, 0xf5, 0x8a, 0xee, 0x37, 0xa6, + 0xc2, 0xd2, 0x04, 0x12, 0xc7, 0x1e, 0x0c, 0x69, 0x35, 0x61, + 0xe3, 0x61, 0x73, 0xdf, 0xad, 0xe8, 0x3d, 0x3b, 0xaa, 0xc2, + 0x19, 0x77, 0x84, 0xb8, 0x3d, 0x6d, 0x2b, 0xb5, 0xcf, 0xd9, + 0x8f, 0x75, 0x33, 0xc3, 0x04, 0xb5, 0xa4, 0xf8, 0xcc, 0xea, + 0x1b, 0x41, 0x11, 0x18, 0x09, 0x9d, 0xf4, 0x64, 
0x8b, 0x59, + 0xee, 0x9e, 0xf8, 0x72, 0xdf, 0x97, 0x1d, 0xca, 0x4e, 0x55, + 0xca, 0x2b, 0xe0, 0x95, 0x9c, 0x0e, 0x91, 0xf5, 0xbf, 0x76, + 0x58, 0x57, 0xbc, 0xa0, 0xc0, 0xd1, 0x01, 0xef, 0x8f, 0x3b, + 0x21, 0x0d, 0x18, 0x4d, 0x1f, 0x98, 0x59, 0x2b, 0x94, 0xc4, + 0x0b, 0xb6, 0x26, 0x93, 0xcc, 0xe5, 0x43, 0x5c, 0xf5, 0xb5, + 0x48, 0x4d, 0x1e, 0xdd, 0x6c, 0x3f, 0x65, 0x5c, 0xb1, 0x00, + 0x62, 0x79, 0xf2, 0xf9, 0xaa, 0xb5, 0xbc, 0x36, 0x17, 0x0d, + 0x80, 0xc3, 0xcc, 0xc3, 0x13, 0x17, 0x5b, 0x90, 0xbc, 0xe8, + 0x6b, 0xbd, 0x9e, 0x8d, 0x15, 0x23, 0xd8, 0x2a, 0x54, 0x37, + 0x2d, 0x04, 0x3c, 0xeb, 0xe4, 0x22, 0x7c, 0x2f, 0x14, 0x8e, + 0xca, 0xbc, 0xbd, 0x8a, 0xb2, 0xd3, 0x85, 0xb4, 0xd3, 0x88, + 0x96, 0x9e, 0x45, 0xdb, 0xd0, 0xd9, 0xf4, 0x8e, 0x8a, 0x80, + 0x7b, 0xb0, 0xfb, 0x2a, 0xed, 0x9d, 0xb5, 0x96, 0xac, 0x96, + 0xcf, 0x0d, 0xfb, 0x82, 0x38, 0x41, 0xbb, 0x48, 0x38, 0x59, + 0x03, 0xb8, 0x41, 0xac, 0xbb, 0x91, 0x82, 0x9c, 0x2c, 0x17, + 0x8b, 0xe2, 0x71, 0x51, 0xdd, 0xe1, 0x14, 0x36, 0x7a, 0xc7, + 0xb0, 0x19, 0x5c, 0x75, 0xca, 0x6a, 0xf4, 0x65, 0x9e, 0xab, + 0xfb, 0x9e, 0xaf, 0x7b, 0x87, 0x96, 0xf5, 0xbf, 0x42, 0xf9, + 0x26, 0x69, 0xfd, 0xfb, 0x52, 0x11, 0x81, 0xb0, 0x52, 0x4a, + 0x24, 0xa8, 0x2d, 0xa0, 0x31, 0x2b, 0xcf, 0x7a, 0x0b, 0xd2, + 0x5b, 0x21, 0xfb, 0x20, 0x28, 0xac, 0x68, 0xd1, 0x11, 0x4e, + 0xe0, 0x35, 0x3c, 0xad, 0x51, 0x56, 0xdc, 0x5c, 0x8d, 0x8e, + 0x13, 0x87, 0xbc, 0x96, 0x3c, 0x03, 0x5f, 0xb2, 0x93, 0xbb, + 0x8c, 0x37, 0x9d, 0xb2, 0x0b, 0x87, 0x26, 0x07, 0x32, 0x43, + 0x73, 0xcb, 0xfc, 0x5c, 0xa6, 0x5d, 0x58, 0xa5, 0x3d, 0xe8, + 0x03, 0xfc, 0x6a, 0xdf, 0xe0, 0x67, 0x85, 0x05, 0xd1, 0x44, + 0x45, 0xbf, 0x2d, 0xd5, 0xa6, 0xd1, 0x7d, 0xa7, 0x5a, 0x82, + 0x44, 0x07, 0x02, 0x70, 0x07, 0x08, 0x86, 0x37, 0x23, 0x98, + 0x50, 0x54, 0x95, 0x99, 0x68, 0xa8, 0x3d, 0xb6, 0x23, 0x5c, + 0x7c, 0x38, 0x74, 0x77, 0x6b, 0xc4, 0xcf, 0xda, 0x6b, 0x7a, + 0x19, 0x20, 0xe1, 0x29, 0xdb, 0xf6, 0xe2, 0x3e, 0x43, 0x56, + 0x87, 0xeb, 0x12, 0x83, 0xb6, 0x0a, 0xcc, 0x20, 0x68, 0xdf, + 
0x5b, 0xf8, 0xe0, 0x35, 0x65, 0x2e, 0xb8, 0xce, 0xa2, 0xa0, + 0x7d, 0xa0, 0x58, 0x36, 0xa6, 0x97, 0x4d, 0x3f, 0x55, 0x3f, + 0x30, 0x6f, 0x66, 0x06, 0x2f, 0x22, 0xd3, 0x6c, 0xab, 0x52, + 0x81, 0x51, 0x88, 0xdb, 0x9c, 0x3c, 0x2d, 0xce, 0xe0, 0x67, + 0x98, 0x00, 0x6e, 0xbb, 0xb7, 0xc6, 0x82, 0xb2, 0x7b, 0x41, + 0x41, 0xe6, 0x62, 0x99, 0x84, 0xe5, 0x58, 0xc5, 0xc9, 0xf0, + 0x5b, 0xc6, 0x2b, 0x2b, 0x75, 0x1d, 0xfe, 0x5b, 0x86, 0x9a, + 0x09, 0xdd, 0x60, 0x39, 0x50, 0xa0, 0xdb, 0xc5, 0x32, 0x7e, + 0x54, 0xc5, 0xad, 0xde, 0x65, 0x37, 0x1e, 0xf2, 0x71, 0x05, + 0x9a, 0x24, 0x24, 0xbb, 0x3f, 0x94, 0x6c, 0x7e, 0x46, 0x54, + 0x05, 0x8f, 0x49, 0x48, 0x29, 0x5e, 0x1a, 0x2b, 0x28, 0xca, + 0xbf, 0xb2, 0x56, 0x97, 0x01, 0x18, 0x7a, 0xf1, 0x22, 0xb6, + 0x72, 0x88, 0x76, 0xb2, 0x72, 0x70, 0x47, 0x6e, 0x74, 0x19, + 0x36, 0x76, 0x50, 0x99, 0x7e, 0x8e, 0x61, 0x54, 0x2c, 0x55, + 0xef, 0xf7, 0x27, 0x62, 0x2f, 0x7a, 0xa6, 0x0b, 0x33, 0xb3, + 0x21, 0xec, 0xde, 0xaf, 0x1e, 0x5d, 0x30, 0x42, 0x82, 0xd9, + 0x59, 0x5e, 0xc9, 0xcf, 0x5a, 0x5b, 0xf5, 0xf6, 0x77, 0xdf, + 0xc1, 0x55, 0x6d, 0x5a, 0x6f, 0x54, 0x9a, 0x1f, 0x2d, 0xc2, + 0x19, 0xe1, 0x54, 0x91, 0xe3, 0x1d, 0x01, 0xae, 0xcb, 0xd0, + 0x73, 0x99, 0x0f, 0x6a, 0x7d, 0x3a, 0x88, 0x81, 0x90, 0xf6, + 0x15, 0xb9, 0x80, 0xf8, 0x2f, 0x4b, 0x39, 0x85, 0x17, 0xc5, + 0xf9, 0x76, 0x1d, 0x35, 0x2e, 0xbf, 0x68, 0x12, 0x6a, 0xb2, + 0xee, 0xc2, 0x4f, 0xa0, 0x1b, 0xa8, 0xad, 0xb9, 0x58, 0x3f, + 0x3b, 0xbb, 0x38, 0x21, 0x29, 0xa4, 0xf5, 0x53, 0xc7, 0x10, + 0xb7, 0x64, 0x52, 0xa4, 0x8b, 0xda, 0x22, 0x0d, 0xc5, 0xbc, + 0xb0, 0xb5, 0xea, 0x3c, 0x15, 0x6e, 0x18, 0x3a, 0x03, 0x92, + 0x49, 0xd5, 0x07, 0xdc, 0x30, 0x93, 0xb8, 0x8c, 0x3b, 0x38, + 0xc4, 0x96, 0xef, 0x7f, 0xb6, 0x6f, 0x0a, 0xe4, 0x8e, 0x4c, + 0xbb, 0x15, 0x87, 0x2a, 0xa5, 0x97, 0xfe, 0xb1, 0xdf, 0x65, + 0xb4, 0xed, 0x9b, 0x1d, 0x7e, 0x6c, 0xa4, 0xa5, 0x20, 0x8f, + 0x09, 0xd6, 0x14, 0x33, 0xad, 0xcd, 0x25, 0xe8, 0x3b, 0x74, + 0x8e, 0x3b, 0x67, 0x07, 0x0f, 0x2b, 0x19, 0x35, 0x10, 0x98, + 0xab, 0x33, 
0xaa, 0x8f, 0xa6, 0xf3, 0x9a, 0xc2, 0x89, 0x94, + 0xfd, 0x72, 0xb2, 0x94, 0x64, 0x77, 0x74, 0x84, 0xf7, 0x0a, + 0x47, 0xc9, 0x14, 0x5c, 0xe0, 0x19, 0xc6, 0xa8, 0xf0, 0xb9, + 0x13, 0x98, 0x11, 0xae, 0x76, 0x85, 0xc8, 0x38, 0xda, 0xe8, + 0x55, 0xb8, 0x1f, 0xe1, 0xf0, 0xfe, 0x77, 0x1c, 0x53, 0xb3, + 0xe9, 0x57, 0x1c, 0x2b, 0x08, 0x89, 0xe1, 0x88, 0x13, 0x24, + 0x57, 0x83, 0x9c, 0x85, 0xd1, 0x14, 0xf8, 0xf9, 0xdd, 0x29, + 0x30, 0x73, 0xae, 0xcd, 0xd0, 0x2a, 0xec, 0xa7, 0x44, 0x75, + 0x8a, 0x38, 0x3e, 0xd3, 0x5b, 0x24, 0x2b, 0xe0, 0x5a, 0x85, + 0x39, 0x8c, 0x0c, 0x51, 0x9b +}; + diff --git a/neureka/pointwise/src/output.c b/neureka/pointwise/src/output.c new file mode 100644 index 0000000..0172aeb --- /dev/null +++ b/neureka/pointwise/src/output.c @@ -0,0 +1,165 @@ +#include "output.h" + +#define OUTPUT_SIZE (1365) +PI_L1 uint8_t output[OUTPUT_SIZE]; + +#define GOLDEN_OUTPUT_SIZE (1365) +PI_L2 uint8_t golden_output[GOLDEN_OUTPUT_SIZE] = { + 0xcd, 0x00, 0x00, 0x00, 0x63, 0x15, 0x00, 0x00, 0x00, 0x00, + 0x36, 0x6f, 0x00, 0x00, 0x27, 0x6d, 0x00, 0x2a, 0xff, 0x31, + 0xff, 0x8f, 0x00, 0x00, 0x01, 0x64, 0x55, 0x16, 0xb8, 0x0a, + 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x77, + 0x00, 0x1b, 0x00, 0x9b, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x30, + 0x60, 0x9b, 0x37, 0x1b, 0xff, 0x00, 0x00, 0xff, 0x00, 0xff, + 0xbf, 0x00, 0x00, 0x00, 0xff, 0x3d, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x4e, 0x00, 0x00, 0x45, 0xe0, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3b, 0xff, + 0x00, 0x28, 0x00, 0xff, 0x00, 0x00, 0xff, 0x77, 0xf8, 0x4b, + 0x00, 0x00, 0x00, 0x8d, 0x37, 0x1c, 0x65, 0x00, 0x05, 0x00, + 0x8d, 0x00, 0x01, 0x50, 0xce, 0x00, 0x00, 0x8c, 0x00, 0x93, + 0x00, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3b, 0x2f, 0x6e, + 0x21, 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x4b, 0xcd, 0x00, + 0x00, 0x00, 0xff, 0x44, 0x0c, 0x5d, 0x00, 0x2f, 0x00, 0x7c, + 0x00, 0x00, 0x1d, 0xf9, 0x00, 0x00, 0x5b, 0x00, 0xd8, 0x00, + 0xf6, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x33, 0x34, 0x00, 
0x21, + 0x19, 0xe8, 0x95, 0x4e, 0xff, 0x00, 0xff, 0x9e, 0x00, 0x00, + 0x00, 0x9b, 0x41, 0x23, 0x47, 0x2c, 0x00, 0x00, 0x7d, 0x00, + 0x00, 0x00, 0x49, 0x00, 0x00, 0xee, 0x00, 0xee, 0x00, 0x86, + 0x71, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x2d, 0xe3, 0x7a, 0x53, + 0xdc, 0x74, 0x14, 0xff, 0x66, 0xff, 0xcc, 0x13, 0x00, 0x00, + 0xce, 0x2e, 0x15, 0x93, 0x09, 0x24, 0x00, 0x70, 0x00, 0x00, + 0x00, 0xe9, 0x0e, 0x00, 0x3a, 0x00, 0x25, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x34, 0x93, 0xea, 0xc3, 0x59, 0x8a, + 0x00, 0xbb, 0xda, 0x3e, 0xff, 0xc3, 0x00, 0x00, 0x00, 0xff, + 0x4a, 0x12, 0x00, 0x34, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x08, 0x86, 0xb1, 0x14, + 0x14, 0x00, 0x00, 0x30, 0x5a, 0xad, 0x48, 0x00, 0xff, 0x00, + 0x00, 0xff, 0x00, 0xff, 0xda, 0x00, 0x00, 0x00, 0x62, 0x4b, + 0x22, 0x49, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x24, 0x64, + 0x00, 0x00, 0xff, 0x00, 0x9d, 0x00, 0x4b, 0xc1, 0x00, 0x00, + 0x00, 0x00, 0x2e, 0x00, 0x11, 0x2b, 0x8f, 0xff, 0x63, 0x00, + 0xd4, 0x12, 0xff, 0xcc, 0x00, 0x00, 0x00, 0x93, 0x5f, 0x1b, + 0x0c, 0x00, 0x43, 0x00, 0x6f, 0x00, 0x00, 0x00, 0xff, 0x00, + 0x00, 0x55, 0x00, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x35, 0xff, 0x00, 0x00, 0x15, 0xf9, 0x00, 0x00, 0xe8, + 0x00, 0xff, 0xff, 0x00, 0x00, 0x02, 0x4f, 0x4a, 0x18, 0xc4, + 0x22, 0x09, 0x00, 0x6b, 0x00, 0x00, 0x08, 0xff, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x2f, 0xdf, 0x00, 0x00, 0x97, 0xff, 0x00, 0x00, 0xba, 0x00, + 0xff, 0xa5, 0x00, 0x00, 0x00, 0x9c, 0x13, 0x12, 0xbb, 0x00, + 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x11, + 0x00, 0x00, 0x00, 0x8a, 0x07, 0x00, 0x00, 0x00, 0x00, 0x36, + 0xd2, 0x8a, 0x6b, 0x00, 0xa2, 0x08, 0x33, 0xf1, 0x0d, 0x2a, + 0xc6, 0x00, 0x00, 0x00, 0x36, 0x54, 0x1a, 0x9e, 0x00, 0x10, + 0x00, 0x59, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0xaf, + 0x56, 0x36, 0x18, 0xb5, 0x00, 0x00, 0x87, 0x00, 0x09, 0xaf, + 0x00, 
0x00, 0x00, 0xec, 0x3b, 0x22, 0x64, 0x00, 0x47, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x9e, 0x15, 0x00, 0x3b, 0x00, 0x00, + 0x00, 0x63, 0xa5, 0x00, 0x18, 0x00, 0x00, 0x34, 0x97, 0x00, + 0x9e, 0x23, 0xa2, 0x64, 0x57, 0xff, 0x00, 0xf2, 0xef, 0x00, + 0x00, 0x00, 0xf2, 0x4f, 0x1b, 0x5e, 0x06, 0x00, 0x00, 0x8c, + 0x00, 0x00, 0x0a, 0xd6, 0x00, 0x00, 0x38, 0x00, 0x61, 0x00, + 0x90, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x2d, 0x00, 0xc8, 0x80, + 0x00, 0xa9, 0x01, 0x01, 0xfb, 0x4b, 0xff, 0xcb, 0x20, 0x00, + 0x00, 0xff, 0x4e, 0x13, 0x94, 0x00, 0x00, 0x00, 0x54, 0x00, + 0x00, 0x00, 0xff, 0x00, 0x00, 0xe9, 0x00, 0x14, 0x31, 0x0c, + 0xd1, 0x00, 0x00, 0x00, 0x00, 0x35, 0x97, 0x46, 0x00, 0x0d, + 0xff, 0x00, 0x00, 0xff, 0x00, 0xb7, 0xbf, 0x00, 0x00, 0x00, + 0xa0, 0x3d, 0x11, 0x9f, 0x00, 0x51, 0x00, 0x7f, 0x00, 0x00, + 0x00, 0x96, 0x00, 0x00, 0x82, 0x00, 0x6a, 0x00, 0x51, 0xae, + 0x00, 0x00, 0x00, 0x00, 0x31, 0xff, 0x00, 0xae, 0x00, 0xf2, + 0x79, 0x00, 0xd7, 0x00, 0xff, 0xef, 0x05, 0x00, 0x00, 0xf0, + 0x44, 0x20, 0xff, 0xed, 0x4d, 0x00, 0x63, 0x00, 0x00, 0x81, + 0xff, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x35, 0x95, 0xff, 0x9d, 0x00, 0xff, 0x00, + 0x00, 0xff, 0x62, 0xff, 0xbb, 0x00, 0x00, 0x00, 0x9e, 0x3c, + 0x1e, 0x97, 0x1f, 0x52, 0x00, 0x53, 0x00, 0x00, 0x43, 0xff, + 0x00, 0x00, 0xa6, 0x00, 0x72, 0x00, 0xfb, 0x4b, 0x01, 0x00, + 0x00, 0x00, 0x31, 0x86, 0x00, 0x36, 0x00, 0xf9, 0x00, 0xa3, + 0xff, 0xed, 0xff, 0xb5, 0x00, 0x00, 0x00, 0x6a, 0x3c, 0x19, + 0x8c, 0x63, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0xff, 0x00, + 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0x00, 0x34, 0x9e, 0x11, 0x2b, 0x00, 0x36, 0x00, 0x00, 0xd6, + 0x30, 0xff, 0x96, 0x00, 0x00, 0x00, 0x4b, 0x27, 0x1d, 0x31, + 0x00, 0x25, 0x00, 0x53, 0x00, 0x00, 0x00, 0xad, 0x00, 0x6c, + 0x70, 0x00, 0x0a, 0x00, 0x62, 0x28, 0x00, 0x00, 0x00, 0x00, + 0x32, 0xba, 0x49, 0x3e, 0x13, 0xeb, 0x27, 0x14, 0xff, 0x11, + 0x09, 0xdd, 0x00, 0x00, 0x00, 0xff, 0x3e, 0x1b, 0x92, 0x00, + 0x00, 0x00, 0x9b, 
0x00, 0x00, 0x00, 0xf2, 0x00, 0x00, 0xf5, + 0x00, 0x16, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x33, + 0x45, 0x01, 0x37, 0x96, 0xff, 0x00, 0xa8, 0xff, 0x00, 0xff, + 0xd5, 0x00, 0x00, 0x00, 0xb0, 0x2e, 0x19, 0xbe, 0x00, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x3e, 0xf9, 0x00, 0x00, 0x69, 0x00, + 0x3d, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0xd3, + 0x0e, 0x2a, 0x3c, 0xc3, 0x00, 0x32, 0xff, 0x71, 0xff, 0x77, + 0x00, 0x00, 0x00, 0xff, 0x56, 0x23, 0x9c, 0x00, 0x52, 0x00, + 0x7d, 0x00, 0x16, 0x5d, 0xa2, 0x00, 0x00, 0x77, 0x00, 0x16, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x33, 0x46, 0x00, + 0x2c, 0x17, 0xdc, 0x0a, 0x00, 0xff, 0x00, 0xff, 0xba, 0x00, + 0x00, 0x00, 0x9e, 0x33, 0x27, 0x5f, 0x3f, 0x00, 0x00, 0x75, + 0x00, 0x00, 0xbd, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0xe6, 0x00, 0x00, + 0x00, 0xc2, 0xbb, 0x00, 0xff, 0x2a, 0xff, 0xff, 0x00, 0x00, + 0x00, 0x47, 0x51, 0x15, 0x71, 0x00, 0x00, 0x00, 0x8a, 0x00, + 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, + 0x6b, 0x00, 0x00, 0x00, 0x00, 0x30, 0x3e, 0x5d, 0x8b, 0x32, + 0xde, 0x00, 0x00, 0xff, 0x49, 0xff, 0x92, 0x00, 0x00, 0x00, + 0x08, 0x4f, 0x12, 0xb3, 0x00, 0x71, 0x00, 0x83, 0x00, 0x0c, + 0x00, 0xe1, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x57, + 0x00, 0x18, 0x00, 0x00, 0x2f, 0xd2, 0x00, 0x00, 0x55, 0xd9, + 0x5c, 0x26, 0xff, 0x00, 0xff, 0xc0, 0x00, 0x00, 0x00, 0xe2, + 0x3c, 0x23, 0x18, 0x0e, 0x28, 0x00, 0x66, 0x00, 0x00, 0x00, + 0xff, 0x00, 0x00, 0x26, 0x00, 0xb4, 0x00, 0x5f, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x35, 0xc6, 0x84, 0x01, 0x53, 0xff, 0x00, + 0x00, 0xff, 0x00, 0xfe, 0x7f, 0x10, 0x00, 0x00, 0xed, 0x2d, + 0x1d, 0xb2, 0x55, 0x00, 0x00, 0x5e, 0x00, 0x26, 0xd2, 0xff, + 0x00, 0x00, 0x03, 0x00, 0x5b, 0x00, 0x2e, 0x20, 0x00, 0x00, + 0x00, 0x00, 0x34, 0xff, 0x9a, 0x3e, 0x00, 0xff, 0x00, 0x00, + 0xff, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x26, 0x55, 0x08, + 0x6a, 0x00, 0x1d, 0x00, 0x83, 0x00, 0x00, 0x7a, 0xff, 0x00, + 0x00, 0x28, 0x00, 0xe7, 0x00, 
0x08, 0x0b, 0x0b, 0x00, 0x00, + 0x00, 0x30, 0xab, 0x00, 0x00, 0x00, 0x64, 0x0c, 0x1e, 0xe4, + 0x27, 0xff, 0x93, 0x05, 0x00, 0x00, 0xff, 0x00, 0x2b, 0x8a, + 0x65, 0x0a, 0x00, 0x5f, 0x00, 0x00, 0x30, 0xff, 0x00, 0x00, + 0x45, 0x00, 0x7d, 0x00, 0x19, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0xf9, 0x00, 0x00, 0x06, 0xa7, 0x00, 0x6f, 0xfd, 0x40, + 0xff, 0xb7, 0x00, 0x00, 0x00, 0xff, 0x22, 0x1e, 0xb5, 0x00, + 0x15, 0x00, 0x92, 0x00, 0x00, 0x54, 0xff, 0x00, 0x00, 0x66, + 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x33, + 0x5b, 0x82, 0x26, 0x60, 0xff, 0x00, 0x00, 0xff, 0x6c, 0xff, + 0xfe, 0x00, 0x00, 0x00, 0x89, 0x63, 0x14, 0x69, 0xe5, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x2b, 0xcf, 0x00, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x84, 0x3d, 0x00, 0x00, 0x00, 0x00, 0x37, 0xaf, + 0x00, 0x00, 0x3e, 0xff, 0x00, 0x42, 0xff, 0x00, 0xff, 0xff, + 0x00, 0x00, 0x04, 0x09, 0x3f, 0x1e, 0xaf, 0x17, 0x00, 0x00, + 0x5d, 0x00, 0x00, 0x22, 0xff, 0x00, 0x1c, 0x56, 0x00, 0x00, + 0x00, 0xd5, 0x19, 0x00, 0x00, 0x00, 0x00, 0x35, 0xc9, 0xff, + 0x00, 0x00, 0x88, 0x00, 0x0b, 0xff, 0x23, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x0b, 0x4e, 0x17, 0x3c, 0x00, 0x00, 0x00, 0x61, + 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0xb8, 0x00, 0xff, 0x00, + 0x45, 0x07, 0x00, 0x00, 0x00, 0x00, 0x2d, 0xe5, 0x00, 0x47, + 0x01, 0xff, 0x00, 0x00, 0xf7, 0x27, 0xff, 0x44, 0x00, 0x00, + 0x00, 0x4e, 0x5e, 0x17, 0xb2, 0x00, 0x00, 0x00, 0x5a, 0x00, + 0x00, 0x00, 0xff, 0x00, 0x36 +}; + +int check_output() { + printf("Checking the output vector:\n"); + + int n_err = 0; + for (int i = 0; i < OUTPUT_SIZE; i++) { + if (output[i] != golden_output[i]) { + printf("ERROR: wrong value of output @ %d: %d vs. golden: %d\n", i, output[i], golden_output[i]); + n_err++; + } + } + + if (n_err == 0) + printf("> Success! No errors found.\n"); + else + printf("> Failure! 
Found %d/%d errors.\n", n_err, OUTPUT_SIZE); + return n_err; + } + + \ No newline at end of file diff --git a/neureka/pointwise/src/scale.c b/neureka/pointwise/src/scale.c new file mode 100644 index 0000000..5eb2c42 --- /dev/null +++ b/neureka/pointwise/src/scale.c @@ -0,0 +1,10 @@ +#include "scale.h" + +#define SCALE_SIZE (39) +PI_L1 uint8_t scale[SCALE_SIZE] = { + 0x1e, 0x02, 0x1a, 0x1d, 0x15, 0x1b, 0x13, 0x07, 0x15, 0x0e, + 0x01, 0x11, 0x1b, 0x11, 0x0e, 0x11, 0x19, 0x13, 0x0d, 0x13, + 0x1e, 0x0f, 0x0b, 0x0b, 0x05, 0x1b, 0x04, 0x02, 0x0c, 0x14, + 0x0c, 0x04, 0x05, 0x0f, 0x07, 0x0f, 0x0f, 0x16, 0x1c +}; + diff --git a/neureka/pointwise/src/weight.c b/neureka/pointwise/src/weight.c new file mode 100644 index 0000000..140952b --- /dev/null +++ b/neureka/pointwise/src/weight.c @@ -0,0 +1,1130 @@ +#include "weight.h" + +#define WEIGHT_SIZE (11232) +PI_L1 uint8_t weight[WEIGHT_SIZE] = { + 0x9f, 0x3b, 0x8a, 0x72, 0xf7, 0x43, 0x00, 0x69, 0xa1, 0xc9, + 0x09, 0x17, 0xa7, 0xab, 0xc5, 0xf7, 0xa7, 0xfa, 0x97, 0xa8, + 0xa5, 0x92, 0x11, 0xdf, 0xa7, 0x92, 0x91, 0xff, 0x58, 0x6d, + 0x6e, 0x00, 0x18, 0xb6, 0x1f, 0x16, 0x49, 0x91, 0x15, 0x60, + 0x06, 0x38, 0x2d, 0x55, 0x2d, 0xa3, 0x6e, 0x7e, 0xd2, 0x59, + 0x80, 0x79, 0x8f, 0x11, 0x4a, 0x7e, 0x8f, 0x11, 0x4a, 0x7e, + 0x70, 0xee, 0xb5, 0x81, 0x68, 0x39, 0xc7, 0xb5, 0x59, 0x5c, + 0xcd, 0x89, 0xfa, 0x75, 0x35, 0xdd, 0x0d, 0x89, 0x5c, 0xcd, + 0x9a, 0xee, 0x05, 0xba, 0x3d, 0xbc, 0x05, 0xca, 0x1d, 0xbc, + 0x05, 0xca, 0xe2, 0x43, 0xfa, 0x35, 0xee, 0xf7, 0xfa, 0x2d, + 0x88, 0xfb, 0x79, 0x5b, 0x2a, 0xc9, 0x15, 0x24, 0xe4, 0xbe, + 0x84, 0x12, 0x1d, 0x2d, 0x28, 0x42, 0xc6, 0xac, 0x00, 0xb2, + 0xc6, 0xac, 0x00, 0x12, 0x39, 0x53, 0xff, 0xed, 0xe0, 0x32, + 0xc6, 0x63, 0x0c, 0xea, 0x91, 0xf4, 0x00, 0x85, 0x05, 0x7f, + 0x6c, 0x88, 0x55, 0x3e, 0xfd, 0x98, 0x18, 0x1d, 0x6d, 0x9e, + 0x3b, 0x25, 0x6d, 0x9e, 0x3b, 0x25, 0x92, 0x61, 0xc4, 0xda, + 0x1e, 0x15, 0x6a, 0x82, 0x3d, 0x68, 0xeb, 0x2f, 0x6e, 0x76, + 0x1d, 0x53, 0xbb, 0x91, 0x91, 0xa7, 0x5c, 0x4a, 
0xfa, 0x3a, + 0xc9, 0x01, 0x2a, 0x1b, 0xc9, 0x01, 0x6a, 0x1b, 0x36, 0xfe, + 0x95, 0xe4, 0x13, 0x69, 0xdd, 0xf5, 0xdc, 0xed, 0xb0, 0x39, + 0xdc, 0xda, 0xf6, 0x76, 0xaf, 0x02, 0xaa, 0xc4, 0x42, 0x3a, + 0xe9, 0x4f, 0x4a, 0x1b, 0xff, 0x4f, 0xca, 0x1a, 0xfb, 0x4f, + 0x35, 0xe5, 0x04, 0xb0, 0x2f, 0x58, 0xab, 0xfa, 0x4c, 0x98, + 0x7f, 0x23, 0x21, 0xa9, 0xfd, 0xce, 0x34, 0xf0, 0x3a, 0x67, + 0x5b, 0x89, 0x6f, 0x3b, 0x01, 0x88, 0x26, 0x3b, 0x01, 0x88, + 0x2e, 0xbb, 0xfe, 0x77, 0xd1, 0x44, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0xd1, + 0x59, 0xb4, 0x26, 0x04, 0x6e, 0x6b, 0xb0, 0x5c, 0x36, 0xaf, + 0xde, 0x83, 0x05, 0xfe, 0x12, 0xc3, 0xe5, 0x71, 0xf6, 0x8f, + 0x0c, 0xcd, 0xf6, 0x8f, 0x24, 0xcd, 0x09, 0x70, 0xdb, 0x32, + 0xb0, 0x48, 0x23, 0x45, 0xb7, 0x93, 0xae, 0xcc, 0x66, 0x56, + 0x86, 0x3f, 0xa5, 0xe8, 0x76, 0xbf, 0xcf, 0x08, 0xa1, 0xed, + 0xff, 0x08, 0x1d, 0xad, 0xff, 0x08, 0x15, 0xad, 0x00, 0xf7, + 0xea, 0x52, 0x78, 0x44, 0x14, 0xba, 0x06, 0x42, 0x66, 0xcb, + 0xc3, 0xe8, 0x9d, 0x75, 0x48, 0x9b, 0x03, 0x94, 0x7b, 0x80, + 0xff, 0x8a, 0xfb, 0x11, 0xce, 0x96, 0x5b, 0x11, 0xce, 0x92, + 0xa4, 0xee, 0x31, 0x6d, 0xe5, 0xe4, 0x8d, 0x49, 0x4d, 0xb4, + 0x11, 0x98, 0x0c, 0x8e, 0x94, 0xbd, 0x3e, 0xfc, 0xbc, 0x24, + 0xe0, 0x7c, 0xfa, 0x85, 0x95, 0xcc, 0x9d, 0x87, 0x94, 0x4c, + 0x9d, 0x87, 0x6b, 0xb3, 0x62, 0x78, 0x01, 0x54, 0xa5, 0xaa, + 0xd9, 0xdd, 0x03, 0xe6, 0xfc, 0x51, 0xce, 0x8e, 0xd4, 0x0c, + 0x6a, 0xe7, 0x4c, 0xf4, 0xfd, 0x13, 0x58, 0x74, 0x87, 0xb3, + 0x58, 0x74, 0x87, 0xbb, 0xa7, 0x8b, 0x78, 0x44, 0x8a, 0x0d, + 0x4a, 0xe6, 0x03, 0x6e, 0x2c, 0xfe, 0xb5, 0x8b, 0x96, 0xbd, + 0xc1, 0xdc, 0xf7, 0xb1, 0x51, 0xc7, 0x5f, 0x03, 0x51, 0xf7, + 0x7b, 0x95, 0x51, 0xe7, 0x7b, 0x95, 0xae, 0x18, 0x84, 0x6a, + 0x54, 0x66, 0x1d, 0xda, 0x22, 0x63, 0xdd, 0x34, 0x7a, 0x6a, + 0x2e, 0x56, 0x99, 0x82, 0x9a, 0xc7, 0xf2, 0x7e, 0x90, 0x5e, + 
0x7a, 0x4e, 0xda, 0x76, 0xfa, 0x4e, 0xda, 0x76, 0x05, 0xb1, + 0x25, 0x89, 0x0d, 0xef, 0x72, 0xd8, 0x3c, 0xe2, 0x2a, 0x60, + 0x9d, 0xd5, 0xf0, 0x43, 0x2b, 0x50, 0x86, 0x30, 0xea, 0x59, + 0x9d, 0x45, 0xe8, 0x18, 0xd9, 0xe7, 0xe8, 0x18, 0xd9, 0xe7, + 0x17, 0xe7, 0x26, 0x18, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x4a, 0x80, 0xce, + 0xd0, 0x4e, 0x9b, 0x8a, 0xba, 0xa5, 0xd9, 0x53, 0x75, 0x4c, + 0x98, 0x23, 0xcc, 0xe4, 0xd7, 0x31, 0x3d, 0x00, 0xfd, 0x0b, + 0x7d, 0x00, 0xdd, 0x0b, 0x82, 0xff, 0x22, 0xf4, 0x75, 0x49, + 0x5f, 0xcc, 0xdb, 0xae, 0xde, 0xab, 0xdd, 0x30, 0x8b, 0x2c, + 0xa6, 0xc4, 0xb5, 0xf1, 0x97, 0x01, 0x7a, 0x23, 0xa3, 0x95, + 0x0e, 0x2d, 0xa3, 0x95, 0x2e, 0x2d, 0x5c, 0x6a, 0xd1, 0xd2, + 0x65, 0xf0, 0x53, 0x99, 0xf0, 0x91, 0xef, 0x32, 0x96, 0xf7, + 0x12, 0xc3, 0xe0, 0x17, 0x13, 0xac, 0x3c, 0x53, 0x2f, 0x1f, + 0x54, 0x77, 0x33, 0xde, 0x14, 0x77, 0x33, 0x9e, 0xeb, 0x88, + 0xcc, 0x61, 0x02, 0x02, 0xca, 0xed, 0xa2, 0xa4, 0xcf, 0x20, + 0xf7, 0x92, 0xdb, 0x88, 0x11, 0x86, 0xee, 0xcf, 0x9a, 0xe0, + 0xc9, 0x6f, 0x9b, 0x82, 0x88, 0xee, 0x9b, 0x82, 0xc8, 0xee, + 0x64, 0x7d, 0x37, 0x11, 0x46, 0x01, 0xa3, 0xbc, 0x0e, 0xc4, + 0x72, 0xd1, 0xc3, 0x85, 0xad, 0xcb, 0x11, 0x25, 0x56, 0x89, + 0x4e, 0x27, 0xcd, 0xcd, 0xa6, 0xa7, 0x6c, 0x7d, 0x86, 0x27, + 0xec, 0xfd, 0x79, 0xd8, 0x13, 0x02, 0xa0, 0x26, 0xfe, 0xe8, + 0xfc, 0xf5, 0x64, 0x2d, 0x59, 0xa7, 0x92, 0xd4, 0x5a, 0xd3, + 0xbe, 0x77, 0x08, 0x4e, 0x04, 0x86, 0x50, 0xce, 0x36, 0x84, + 0x50, 0xce, 0x36, 0x84, 0xaf, 0x31, 0xc9, 0x7b, 0xbb, 0x25, + 0x67, 0x70, 0x0c, 0x20, 0x92, 0x7f, 0x30, 0xc2, 0xd0, 0x88, + 0x27, 0xaa, 0x32, 0xaf, 0x65, 0xd0, 0x19, 0x4f, 0x83, 0x98, + 0x1e, 0xb3, 0x03, 0x98, 0x1e, 0xb3, 0xfc, 0x67, 0xe1, 0x4c, + 0x73, 0x1e, 0x1f, 0xd8, 0xb7, 0x16, 0x15, 0x8b, 0x96, 0xc5, + 0x15, 0xa3, 0xa5, 0x2b, 0xae, 0x7a, 0x30, 0xb5, 0x0f, 0x89, + 0x2f, 0x92, 
0x37, 0xa8, 0x27, 0x92, 0x17, 0xa8, 0xd8, 0x6d, + 0xe8, 0x57, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x4b, 0x8a, 0x05, 0xdc, 0x97, 0x61, + 0xcf, 0x05, 0xb4, 0x98, 0x55, 0x98, 0xc4, 0xc5, 0xd8, 0xb1, + 0x28, 0x49, 0x74, 0x9d, 0x89, 0x8b, 0x74, 0xbd, 0x89, 0xcb, + 0x74, 0xbd, 0x76, 0x34, 0x8b, 0x42, 0xe3, 0x67, 0x8e, 0x28, + 0x5e, 0x2e, 0x32, 0x1f, 0xdf, 0x9e, 0xa1, 0x1f, 0xcf, 0x0f, + 0x1b, 0x16, 0xaa, 0x3e, 0x33, 0x31, 0x83, 0x37, 0xb1, 0x75, + 0x83, 0x36, 0xb1, 0x75, 0x7c, 0xc9, 0x4e, 0x8a, 0xe4, 0x3f, + 0xc5, 0x8b, 0xd2, 0x10, 0xf2, 0xc2, 0x99, 0x4f, 0x15, 0xe1, + 0x72, 0xfc, 0x83, 0xf4, 0x79, 0x03, 0x05, 0x83, 0x79, 0x0f, + 0xc7, 0x80, 0x79, 0x0f, 0xc7, 0x80, 0x86, 0xf0, 0x38, 0x7f, + 0x91, 0x81, 0xee, 0x27, 0xf3, 0x13, 0xbd, 0x47, 0x55, 0xbf, + 0xe1, 0x5e, 0xa4, 0x47, 0xed, 0xf3, 0x1b, 0x7b, 0x41, 0x5b, + 0x26, 0x5b, 0xe5, 0x7b, 0x26, 0x5b, 0x65, 0x7b, 0xd9, 0xa4, + 0x9a, 0x84, 0x3b, 0x70, 0x1e, 0x38, 0x31, 0x21, 0x9c, 0x48, + 0xe7, 0x9c, 0x1b, 0x8d, 0x67, 0xc9, 0x9c, 0x89, 0x26, 0x1a, + 0xc3, 0x2f, 0x27, 0xd8, 0xf3, 0xbb, 0x27, 0x98, 0xb3, 0xab, + 0xd8, 0x67, 0x4c, 0x54, 0x48, 0xbd, 0xf9, 0x4f, 0x2c, 0x37, + 0xc9, 0xdd, 0x4d, 0xef, 0x59, 0x09, 0x73, 0x9d, 0xb8, 0x36, + 0x9b, 0x06, 0x11, 0xc0, 0xb3, 0x16, 0x5d, 0x09, 0xb3, 0x16, + 0x59, 0x09, 0x4c, 0xe9, 0xa6, 0xf6, 0x25, 0xf5, 0xf8, 0x08, + 0x9b, 0x75, 0x87, 0x32, 0x0a, 0xd0, 0x64, 0x04, 0xbb, 0x9d, + 0x5c, 0x30, 0x75, 0xa3, 0x9e, 0x55, 0xb7, 0x94, 0xcd, 0x01, + 0xb7, 0x97, 0xcd, 0x01, 0x48, 0x68, 0x32, 0xfe, 0x4f, 0x22, + 0x33, 0xd1, 0x46, 0x2a, 0x5d, 0xc7, 0x69, 0x59, 0xf0, 0xc2, + 0x1e, 0xc6, 0xb6, 0xca, 0xa0, 0x1f, 0xd6, 0x1f, 0x2b, 0xc6, + 0xdc, 0xfb, 0x29, 0xc6, 0xd4, 0xfb, 0xd6, 0x39, 0x2b, 0x04, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x78, 0xdb, 0x23, 0x5a, 0xc8, 0xac, 0xb1, 0x69, + 0x2d, 0x8a, 0xc1, 0x6a, 0x7b, 0x9c, 0x59, 0x1c, 0xc6, 0x09, + 0x3e, 0x88, 0xfe, 0x00, 0x5b, 0x68, 0xfe, 0x00, 0x5b, 0x68, + 0x01, 0xff, 0xa4, 0x97, 0x62, 0xd2, 0xf9, 0xdf, 0x04, 0x54, + 0x66, 0xcf, 0x69, 0x13, 0x3e, 0x79, 0xc4, 0xa3, 0x28, 0x1c, + 0x81, 0xb4, 0x86, 0xa5, 0xc4, 0xa4, 0xa8, 0x11, 0xc4, 0xa4, + 0xa0, 0x11, 0x3b, 0x5b, 0x5f, 0xee, 0xbe, 0x03, 0x85, 0x09, + 0x4c, 0x70, 0xa5, 0x8a, 0x8a, 0xe9, 0x64, 0x34, 0x4d, 0x6b, + 0x08, 0xe0, 0x6f, 0x60, 0xdc, 0x8c, 0x4f, 0x72, 0xc8, 0xa4, + 0x4f, 0x70, 0xc8, 0xac, 0xb0, 0x8f, 0x37, 0x53, 0xd0, 0xd6, + 0xbe, 0x82, 0x0b, 0x60, 0xbc, 0x0b, 0xea, 0xc9, 0x00, 0xb8, + 0x4d, 0xf7, 0xc5, 0x2b, 0x43, 0xc2, 0xcd, 0x2c, 0x4b, 0x4a, + 0x85, 0x16, 0x4b, 0xc8, 0xc5, 0x16, 0xb4, 0x37, 0x3a, 0xe9, + 0x1f, 0x07, 0x35, 0x29, 0x54, 0x16, 0x57, 0xfc, 0xac, 0x94, + 0x3b, 0xf2, 0xb7, 0x54, 0xa6, 0x89, 0xc4, 0xa5, 0x94, 0x63, + 0x85, 0x87, 0xe3, 0xe7, 0x85, 0x87, 0xa2, 0xe3, 0x7a, 0x78, + 0x5d, 0x1c, 0x11, 0xa7, 0xf8, 0x78, 0x8c, 0x11, 0xa1, 0xe8, + 0x75, 0xbf, 0x6e, 0x3f, 0x05, 0xe7, 0x2d, 0xae, 0x9e, 0xed, + 0x9a, 0x32, 0xd1, 0xef, 0xdb, 0x22, 0xd1, 0xef, 0x9b, 0x22, + 0x2e, 0x10, 0x64, 0xdd, 0x00, 0xb6, 0x03, 0x5d, 0x88, 0x5b, + 0x8f, 0xc7, 0x71, 0x40, 0x83, 0x28, 0x6f, 0x1d, 0xf3, 0xf3, + 0x7c, 0x40, 0xd4, 0xe9, 0x4c, 0x51, 0xc8, 0xe3, 0x4c, 0x51, + 0xc4, 0xe3, 0xb3, 0xae, 0x3b, 0x1c, 0xa8, 0x94, 0x08, 0x8f, + 0x88, 0x6d, 0x96, 0x3d, 0xcd, 0x50, 0xa6, 0xf4, 0x58, 0x41, + 0xd7, 0xa1, 0x58, 0x3a, 0x2d, 0x4e, 0x74, 0x78, 0x2f, 0xcc, + 0x74, 0x78, 0x2f, 0xcc, 0x8b, 0x87, 0xd0, 0x33, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0xca, 0xd3, 0x0c, 0x83, 0x34, 0xb6, 0x8c, 0x65, 0xb7, 0x0f, + 0x20, 0x7d, 0xf7, 0xaa, 0x57, 0xc3, 0x1b, 0x4a, 0xb4, 0xcf, + 0x6b, 0x2a, 0xb4, 0x76, 0x7b, 0x2a, 
0xb4, 0x76, 0x84, 0xd5, + 0x4b, 0x89, 0x47, 0xe6, 0x1e, 0xd2, 0x99, 0x2c, 0x18, 0xf7, + 0x80, 0x02, 0x61, 0x8a, 0x12, 0xfc, 0x89, 0xa3, 0x99, 0x93, + 0x64, 0xec, 0x8d, 0xb2, 0xe1, 0xf7, 0x89, 0xb3, 0xe1, 0xf7, + 0x76, 0x4c, 0x1e, 0x08, 0x1f, 0xff, 0x88, 0x4b, 0x72, 0x31, + 0x0e, 0x14, 0xd1, 0x68, 0x91, 0x9d, 0x2d, 0x90, 0xa8, 0x8d, + 0xc1, 0x85, 0xf8, 0x6e, 0x14, 0x80, 0xb2, 0x6d, 0x04, 0x80, + 0xb8, 0x6d, 0xfb, 0x7f, 0x47, 0x92, 0x6c, 0x25, 0xfe, 0xbb, + 0x22, 0x74, 0xac, 0x32, 0x08, 0xd0, 0xb9, 0x98, 0x8a, 0x2c, + 0xc9, 0xba, 0xc4, 0x03, 0x70, 0x24, 0xcc, 0x07, 0xbd, 0x38, + 0xcc, 0x07, 0xbc, 0x38, 0x33, 0xf8, 0x43, 0xc7, 0x03, 0x8f, + 0x12, 0x85, 0x7e, 0xc2, 0xa2, 0x58, 0x2e, 0xdf, 0xee, 0xd9, + 0x5a, 0x4e, 0x35, 0x0f, 0xdc, 0x19, 0x68, 0xc3, 0x56, 0x9a, + 0x01, 0x5b, 0x5e, 0x1a, 0x41, 0x5b, 0xa1, 0xe5, 0xbe, 0xa4, + 0x61, 0x55, 0xb5, 0x36, 0x7c, 0xaf, 0x79, 0x28, 0xee, 0x65, + 0x4c, 0x17, 0x29, 0xa6, 0x9a, 0xe1, 0xdb, 0xd6, 0xfa, 0x83, + 0x5f, 0xe8, 0xac, 0x29, 0x5d, 0xc6, 0xa8, 0x29, 0xa2, 0x39, + 0x57, 0xd6, 0xf4, 0xa1, 0x69, 0x75, 0x8b, 0x66, 0x66, 0xbd, + 0xe9, 0xd5, 0x3f, 0xda, 0x69, 0x1e, 0x0a, 0x95, 0x47, 0xe9, + 0x00, 0x5b, 0xa3, 0xe0, 0x79, 0x0e, 0xa3, 0xe8, 0x58, 0x1a, + 0x5c, 0x17, 0xa7, 0xe5, 0xc2, 0x17, 0xe0, 0xda, 0x9e, 0x76, + 0x68, 0x43, 0x21, 0xd1, 0xff, 0x6c, 0x24, 0x85, 0x79, 0x50, + 0x75, 0xf4, 0x02, 0xcb, 0xe5, 0xfc, 0x44, 0xe7, 0x65, 0xf4, + 0x40, 0xee, 0x9a, 0x0b, 0xbf, 0x11, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x5f, + 0x4b, 0x87, 0xac, 0x0d, 0xca, 0xed, 0x8e, 0x1c, 0x34, 0xe7, + 0xaa, 0x88, 0xa2, 0x36, 0x5c, 0x47, 0xa8, 0xd2, 0x00, 0x04, + 0xeb, 0x96, 0x04, 0x04, 0xea, 0x96, 0xfb, 0xfb, 0x15, 0x69, + 0x0c, 0x78, 0xa1, 0x71, 0x29, 0x4b, 0x5e, 0x7f, 0x2a, 0xdd, + 0x8e, 0x99, 0xc7, 0x3a, 0x91, 0x94, 0xee, 0x9b, 0x1d, 0x25, + 0xbe, 0x11, 0x15, 0x00, 0xbe, 0x19, 0x1d, 0x00, 
0x41, 0xe6, + 0xe2, 0xff, 0x88, 0xdb, 0x6b, 0x02, 0x47, 0xe6, 0x35, 0xe8, + 0xa7, 0x9b, 0x5b, 0x99, 0xc6, 0xb8, 0xd7, 0xa2, 0x08, 0x03, + 0x76, 0x40, 0xe0, 0x97, 0xfb, 0xba, 0xc0, 0x97, 0xfb, 0xaa, + 0x3f, 0x68, 0x04, 0x55, 0x4a, 0x42, 0x8e, 0xa3, 0x33, 0x66, + 0xa3, 0x39, 0x4e, 0xc0, 0xf9, 0x7e, 0x05, 0x86, 0x00, 0x34, + 0x8b, 0x7f, 0x5e, 0x3b, 0x80, 0x6f, 0x5c, 0xbb, 0x81, 0x6f, + 0x5c, 0xbb, 0x7e, 0x90, 0xa3, 0x44, 0xea, 0xb7, 0xac, 0x08, + 0x62, 0x5d, 0x4a, 0x25, 0x38, 0xd8, 0xef, 0xb8, 0x15, 0xa7, + 0xe9, 0xda, 0x80, 0xcf, 0x71, 0x23, 0x16, 0xa6, 0x3b, 0x73, + 0x14, 0xae, 0x7b, 0x73, 0xeb, 0x51, 0x84, 0x8c, 0x7a, 0x61, + 0x8d, 0x92, 0x48, 0x9f, 0xd7, 0x0e, 0x62, 0x8e, 0x5a, 0x58, + 0x47, 0x60, 0x35, 0x9c, 0x5a, 0x6e, 0x10, 0x2f, 0x0a, 0x19, + 0x1c, 0x3f, 0x0a, 0x19, 0x1c, 0x3f, 0xf5, 0xe6, 0xe3, 0xc0, + 0x07, 0x93, 0x9e, 0x91, 0x7b, 0x6d, 0x0a, 0x4a, 0x2a, 0x21, + 0xdb, 0xfb, 0x40, 0x1c, 0x96, 0xff, 0xa1, 0x8e, 0x14, 0xde, + 0x12, 0xff, 0x30, 0x7e, 0x02, 0xff, 0x12, 0xfe, 0xfd, 0x00, + 0xed, 0x01, 0x42, 0x11, 0x91, 0xaa, 0xfe, 0x89, 0x1f, 0x1a, + 0x9b, 0xb7, 0xb2, 0xf6, 0xef, 0x73, 0x92, 0x49, 0x8d, 0x84, + 0xcc, 0x21, 0x98, 0x86, 0x98, 0x06, 0x98, 0x86, 0x98, 0x02, + 0x67, 0x79, 0x67, 0xfd, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x38, 0xcd, 0x21, 0x7b, + 0x6a, 0xaf, 0x7c, 0xf0, 0xc4, 0x58, 0x07, 0x7f, 0x2d, 0xfb, + 0xdb, 0xfe, 0xa7, 0x81, 0x9d, 0x89, 0x8d, 0xd9, 0xd9, 0xca, + 0xad, 0xd9, 0xd9, 0xca, 0x52, 0x26, 0x26, 0x35, 0x26, 0xe5, + 0xf5, 0xfe, 0x3b, 0xa4, 0x71, 0x1b, 0x44, 0xcb, 0xfd, 0xb6, + 0x9a, 0xf3, 0x27, 0x22, 0xc8, 0xfa, 0x78, 0x6c, 0xe9, 0xfa, + 0x6e, 0x74, 0xe8, 0xfa, 0x6e, 0x74, 0x17, 0x05, 0x91, 0x8b, + 0x93, 0x49, 0xa2, 0x64, 0xfc, 0x78, 0x4b, 0xb5, 0x20, 0x51, + 0x90, 0xb5, 0x4e, 0x65, 0x9e, 0xa3, 0x28, 0xa0, 0x04, 0xba, + 0x40, 0xa6, 0x48, 0x39, 0x40, 0xa6, 0x48, 0x39, 0xbf, 0x59, + 
0xb7, 0xc6, 0x2f, 0xf4, 0xdf, 0x7b, 0x46, 0x8f, 0x30, 0x10, + 0x65, 0x59, 0x79, 0x8e, 0xc7, 0x96, 0xe4, 0xed, 0x3f, 0x2b, + 0x6d, 0x9f, 0xae, 0xad, 0xe9, 0x99, 0xae, 0xaf, 0x69, 0x9d, + 0x51, 0x50, 0x96, 0x62, 0xb9, 0x23, 0x8e, 0xa3, 0x0b, 0x15, + 0x87, 0x3a, 0x9b, 0x56, 0x2d, 0x71, 0x0e, 0x35, 0xb6, 0xea, + 0x8d, 0xce, 0xa1, 0x29, 0x8e, 0x2a, 0xfa, 0x0b, 0x8e, 0x0a, + 0xea, 0x09, 0x71, 0xf5, 0x15, 0xf6, 0x61, 0x8e, 0xce, 0xa5, + 0x47, 0xce, 0x20, 0x70, 0x3b, 0xb3, 0x8b, 0xa1, 0x0b, 0xbe, + 0x46, 0x5f, 0x2a, 0x45, 0xdc, 0x1b, 0x8b, 0x9a, 0x10, 0xbf, + 0x8b, 0x9a, 0x90, 0xbf, 0x74, 0x65, 0x6f, 0x40, 0xfd, 0x99, + 0x2b, 0x2c, 0x34, 0xd2, 0x98, 0x6c, 0xa2, 0x96, 0x7f, 0xeb, + 0x01, 0x7f, 0xce, 0xf3, 0x57, 0xde, 0x3a, 0xbc, 0x73, 0xbe, + 0x1d, 0xd9, 0x73, 0xbe, 0x1f, 0xd9, 0x8c, 0x41, 0xe0, 0x26, + 0xce, 0xa8, 0xec, 0x37, 0xd6, 0x13, 0x47, 0xb2, 0x58, 0xbd, + 0x88, 0xa5, 0x22, 0xbb, 0x85, 0x61, 0xaf, 0xbd, 0x6d, 0x5a, + 0x32, 0x89, 0xed, 0x6b, 0x3a, 0x99, 0xed, 0x6b, 0xc5, 0x66, + 0x12, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x7c, 0x06, 0x63, 0x8a, 0x5e, 0x18, + 0xbb, 0x88, 0x3a, 0x7a, 0xf9, 0xd3, 0x4d, 0x85, 0xfe, 0x2a, + 0x19, 0x8f, 0x50, 0xc3, 0xcd, 0x23, 0x9f, 0xcc, 0xcd, 0x2b, + 0xdf, 0xce, 0x32, 0xd4, 0x20, 0x31, 0x81, 0x32, 0xe7, 0x3e, + 0x93, 0x3a, 0x17, 0x89, 0x21, 0x48, 0xf5, 0xd7, 0x60, 0x44, + 0x37, 0x04, 0x78, 0x53, 0xb3, 0xed, 0xb0, 0xcc, 0x3e, 0xad, + 0xb0, 0xc4, 0x36, 0xad, 0x4f, 0x3b, 0xc9, 0x52, 0x3a, 0x01, + 0x6b, 0xa2, 0xf2, 0x11, 0x50, 0x3d, 0x30, 0xb4, 0x5e, 0xfe, + 0x87, 0x19, 0x58, 0x7c, 0x7d, 0x90, 0xa4, 0x7a, 0xb9, 0x3c, + 0xb8, 0xf6, 0xb9, 0x3c, 0xb8, 0xfe, 0x46, 0xc3, 0x47, 0x01, + 0xbb, 0xb5, 0xfc, 0x72, 0xc6, 0x36, 0x7c, 0xbd, 0x08, 0x16, + 0xf8, 0x39, 0x0e, 0xcf, 0xed, 0xa8, 0x1a, 0x8f, 0xd0, 0x19, + 0x53, 0xe7, 0x55, 0x39, 0x13, 0xe7, 0x55, 0x39, 0xec, 0x18, + 0xaa, 0xc6, 
0xc2, 0x10, 0x66, 0xbf, 0x83, 0xbf, 0x7a, 0x46, + 0x84, 0xe2, 0x90, 0xf9, 0x16, 0xf6, 0x0d, 0x2e, 0x1c, 0x8f, + 0xb0, 0x81, 0x08, 0x96, 0xf4, 0x1b, 0x18, 0x96, 0xf4, 0x1b, + 0xe7, 0x69, 0x0b, 0xe4, 0x7f, 0x88, 0xfd, 0xa9, 0xaa, 0x9c, + 0xb8, 0x31, 0x91, 0xf0, 0xe1, 0xa6, 0xe2, 0x4e, 0x8d, 0x99, + 0x15, 0xbe, 0x00, 0x0c, 0x79, 0xfa, 0x69, 0x97, 0x79, 0xfe, + 0x49, 0x95, 0x86, 0x01, 0xb6, 0x6a, 0xc8, 0xde, 0x64, 0x2c, + 0x07, 0x35, 0xdf, 0x26, 0x62, 0x1f, 0xa3, 0xc3, 0x95, 0xfc, + 0x2d, 0x04, 0x46, 0x90, 0x31, 0xbd, 0x25, 0x7d, 0x73, 0x0c, + 0x25, 0x7c, 0x73, 0x0c, 0xda, 0x83, 0x8c, 0xf3, 0xfe, 0x6b, + 0xd7, 0x46, 0x86, 0xf6, 0x0b, 0x5e, 0x55, 0xde, 0x4c, 0x5d, + 0x67, 0xec, 0x0f, 0x6c, 0x7c, 0x97, 0x6a, 0x2a, 0xe6, 0x8f, + 0x2a, 0xa3, 0x66, 0xdf, 0x2a, 0xab, 0x99, 0x20, 0xd5, 0x54, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x12, 0x00, 0x77, 0x8b, 0xee, 0x27, 0x42, 0x81, + 0x7c, 0x43, 0x31, 0xf3, 0x45, 0x89, 0xed, 0x09, 0x18, 0x79, + 0xda, 0x83, 0xce, 0xf8, 0x4f, 0x03, 0xce, 0xf8, 0xdb, 0x03, + 0x31, 0x07, 0x24, 0xfc, 0x84, 0xb3, 0x04, 0x7b, 0x22, 0xf6, + 0xa6, 0x45, 0x8a, 0x6a, 0xcb, 0x6f, 0x5e, 0x6b, 0x31, 0x55, + 0xe7, 0x2d, 0x17, 0x2a, 0xf4, 0x03, 0x33, 0x3b, 0xf6, 0x43, + 0x33, 0x3b, 0x09, 0xbc, 0xcc, 0xc4, 0x16, 0x4c, 0x18, 0xd8, + 0xc8, 0xa2, 0xa8, 0x45, 0xed, 0xd9, 0xc8, 0x6a, 0xe1, 0x39, + 0x8c, 0x26, 0x2c, 0x21, 0x8e, 0x26, 0x40, 0xf1, 0x86, 0x04, + 0x78, 0x71, 0x8e, 0x26, 0x87, 0x8e, 0x71, 0xd9, 0x61, 0x1e, + 0xb0, 0x82, 0x4b, 0x54, 0xb5, 0xde, 0x8d, 0xba, 0x26, 0xc7, + 0xbd, 0x54, 0xc7, 0x6f, 0x2c, 0x1f, 0x50, 0x38, 0x18, 0x5d, + 0x40, 0x3e, 0x18, 0x5d, 0x40, 0x3e, 0xe7, 0xa2, 0xbf, 0xc1, + 0x2d, 0x17, 0x5a, 0xb0, 0xa9, 0xdd, 0x60, 0x10, 0xe9, 0xd3, + 0x15, 0x8a, 0x7d, 0x94, 0x20, 0xf1, 0x8e, 0x0c, 0xdb, 0x48, + 0xef, 0x4b, 0xab, 0x9e, 0xef, 0x4b, 0xab, 0xde, 0x10, 0xb4, + 0x54, 0x21, 0x2c, 0x3e, 
0x3a, 0x45, 0x53, 0x0e, 0x7f, 0x2f, + 0x44, 0x31, 0x55, 0x48, 0xec, 0x31, 0x2b, 0x38, 0xb4, 0xa0, + 0xe1, 0x0a, 0xbc, 0x0b, 0x61, 0x2b, 0xbc, 0x03, 0x61, 0x6b, + 0x43, 0xfc, 0x9e, 0xd4, 0x6c, 0x85, 0xa1, 0x59, 0x6c, 0x25, + 0x95, 0x9c, 0x63, 0xd9, 0xbc, 0x35, 0x18, 0x03, 0x89, 0x6b, + 0xc7, 0xd2, 0xf5, 0xee, 0xe3, 0x71, 0xc5, 0xaa, 0xe3, 0x51, + 0xc5, 0xea, 0x1c, 0xae, 0x3a, 0x15, 0x05, 0x1b, 0x2c, 0xd0, + 0x30, 0xc2, 0x94, 0x31, 0x54, 0xc8, 0x8b, 0x8f, 0xde, 0x62, + 0x1d, 0x58, 0x99, 0x77, 0x2c, 0xc9, 0x1c, 0xf3, 0x34, 0x4d, + 0x1c, 0xf3, 0x3c, 0x4d, 0xe3, 0x0c, 0xc3, 0xb2, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x14, 0x2e, 0x50, 0x55, 0x21, 0x4f, 0xb4, 0x93, 0x75, 0x95, + 0x3f, 0x2f, 0x6e, 0x70, 0x02, 0x44, 0x9e, 0xd4, 0xee, 0x10, + 0xeb, 0x75, 0x42, 0x4c, 0xeb, 0x75, 0x42, 0x44, 0x14, 0x8a, + 0xbd, 0xbb, 0xef, 0x2d, 0xb8, 0x77, 0x9f, 0x2c, 0x1d, 0x29, + 0xe0, 0x05, 0x9c, 0x5a, 0xcf, 0xf6, 0xf6, 0x79, 0x4b, 0x06, + 0xac, 0xef, 0xaa, 0x94, 0xac, 0xed, 0xca, 0x14, 0xac, 0xef, + 0x35, 0xeb, 0x53, 0x10, 0xb9, 0x22, 0x21, 0xcc, 0x95, 0x41, + 0x36, 0xec, 0x7c, 0x83, 0x66, 0xe0, 0x36, 0x48, 0x65, 0xe4, + 0x7a, 0xf2, 0xe2, 0x56, 0xb6, 0xf0, 0xe0, 0xee, 0xb6, 0xf0, + 0xe0, 0xee, 0x49, 0x0f, 0x1f, 0x11, 0x7d, 0x76, 0xd8, 0xa0, + 0xc1, 0x10, 0xb9, 0x4c, 0xcd, 0xb2, 0xa4, 0x27, 0x53, 0x97, + 0xe3, 0x7e, 0xe0, 0xfb, 0x95, 0x97, 0x61, 0x6a, 0x96, 0x1b, + 0x61, 0x7b, 0x96, 0x9b, 0x9e, 0x84, 0x69, 0x64, 0x3e, 0x37, + 0xa1, 0xb8, 0x3a, 0x09, 0xee, 0x34, 0x2a, 0xd8, 0x7a, 0x51, + 0x8b, 0x6c, 0xe8, 0x7d, 0x9c, 0x51, 0x8f, 0xc3, 0x90, 0x1d, + 0xbe, 0xc4, 0x90, 0x19, 0xbf, 0xc4, 0x6f, 0xe6, 0x40, 0x3b, + 0x13, 0xa1, 0x45, 0x1b, 0x0b, 0x8b, 0x22, 0xec, 0x34, 0x25, + 0x8e, 0x44, 0x8c, 0x9b, 0x72, 0x12, 0xb8, 0x8b, 0x5f, 0x54, + 0xb0, 0x70, 0x1a, 0x74, 0xb0, 0xba, 0x1a, 0x14, 0x4f, 0x45, + 0xe5, 0xeb, 0x52, 0xdc, 0x5b, 0x77, 
0xdb, 0xa7, 0xac, 0x22, + 0x15, 0xf9, 0xd9, 0xa4, 0xc9, 0x6a, 0xf6, 0x9f, 0xb3, 0x62, + 0xd4, 0x57, 0xb1, 0x6a, 0xa9, 0xc5, 0xb1, 0x6a, 0xb9, 0xc5, + 0x4e, 0x95, 0x46, 0x3a, 0x82, 0xa5, 0x81, 0x62, 0x47, 0xa5, + 0xa6, 0xca, 0x90, 0x67, 0xee, 0x22, 0x57, 0x4d, 0x11, 0x48, + 0xe9, 0x55, 0x2d, 0xf7, 0x7a, 0x17, 0x29, 0x8e, 0x7a, 0x55, + 0x29, 0xce, 0x85, 0xaa, 0xd6, 0x31, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0xea, + 0x42, 0x9b, 0x87, 0x51, 0xd4, 0xb1, 0x0d, 0xa3, 0x13, 0x96, + 0xed, 0xf5, 0x90, 0xce, 0x49, 0x6b, 0xd9, 0xb8, 0x19, 0x62, + 0x70, 0xb2, 0x19, 0x63, 0x50, 0xb2, 0xe6, 0x9c, 0xaf, 0x4d, + 0x25, 0x1a, 0x7e, 0x0d, 0x88, 0x4b, 0x2e, 0xb3, 0xea, 0x33, + 0x81, 0xe3, 0xb2, 0x90, 0xcb, 0x97, 0xd5, 0x33, 0x13, 0x84, + 0xb4, 0x03, 0x11, 0x31, 0xb4, 0x03, 0x11, 0x10, 0x4b, 0xfc, + 0xee, 0xef, 0x5a, 0x56, 0xd7, 0x2a, 0xc2, 0x83, 0x8a, 0x07, + 0xa1, 0x6f, 0xf9, 0x84, 0x26, 0x21, 0x67, 0xa5, 0xe2, 0x8a, + 0xc6, 0xb3, 0x52, 0x0e, 0xce, 0x21, 0x52, 0x0e, 0xce, 0x21, + 0xad, 0xf1, 0x31, 0xde, 0x28, 0x6a, 0x17, 0x5b, 0x92, 0xd0, + 0xf5, 0x33, 0xfa, 0x22, 0x54, 0xf7, 0x82, 0xc3, 0x71, 0x73, + 0x80, 0x48, 0x1f, 0x7a, 0x9c, 0x70, 0x74, 0xba, 0x98, 0x70, + 0x55, 0xba, 0x67, 0x8f, 0xaa, 0x45, 0x28, 0x9f, 0xb7, 0xcc, + 0x72, 0x83, 0x2a, 0xe7, 0x9c, 0xa6, 0xa3, 0x8f, 0x38, 0xe7, + 0x6d, 0x7d, 0x3d, 0xb0, 0x83, 0xfb, 0x23, 0xf1, 0x82, 0xff, + 0x2b, 0xb1, 0x83, 0xff, 0xd4, 0x4e, 0x7c, 0x00, 0x7c, 0x79, + 0x36, 0xcc, 0x2d, 0xf5, 0xae, 0x00, 0x8d, 0x7b, 0xa7, 0xcf, + 0xb2, 0xde, 0xa2, 0x7c, 0xf2, 0xd5, 0x95, 0x0d, 0x94, 0xb0, + 0x75, 0x26, 0x96, 0xf0, 0x35, 0x24, 0x69, 0x0f, 0xca, 0xdb, + 0x20, 0x9d, 0x8f, 0x24, 0xc9, 0x97, 0x2b, 0x1e, 0xf5, 0xaf, + 0x7a, 0x22, 0xa6, 0xa7, 0x1e, 0x76, 0xf4, 0x95, 0x7e, 0x7a, + 0x3d, 0x97, 0x3e, 0xda, 0xbd, 0x97, 0x3e, 0xfa, 0x42, 0x68, + 0xc1, 0x05, 0x78, 0xe2, 0x57, 0x31, 0xcd, 0xe2, 
0x53, 0x25, + 0x2a, 0x66, 0x68, 0x7d, 0x3d, 0x09, 0xcf, 0x8a, 0x29, 0xb7, + 0xe3, 0x4c, 0x0b, 0x77, 0x09, 0x01, 0x09, 0x77, 0x49, 0x01, + 0xf6, 0x88, 0xb6, 0xfe, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xac, 0x21, 0x34, + 0xf3, 0xbb, 0x7f, 0x8f, 0x29, 0x3a, 0x0f, 0x0a, 0xb4, 0xb8, + 0x5d, 0x00, 0x89, 0x59, 0xd7, 0x43, 0x0b, 0x3a, 0x64, 0x19, + 0x09, 0x3a, 0x65, 0x08, 0xf6, 0xc5, 0x9a, 0xf7, 0xb2, 0x0d, + 0xb7, 0xbd, 0xbc, 0x0d, 0x50, 0xca, 0xd4, 0x1d, 0x93, 0x44, + 0xc0, 0xa0, 0x14, 0x0f, 0xfe, 0x7a, 0x3c, 0x1b, 0x37, 0xba, + 0x61, 0x77, 0x37, 0xba, 0x70, 0x7f, 0xc8, 0x45, 0x8f, 0x80, + 0x2e, 0x33, 0xd8, 0x88, 0x61, 0xa4, 0xce, 0x30, 0xe1, 0xea, + 0x8c, 0x6f, 0xf2, 0x1f, 0x2d, 0xaa, 0xb0, 0x7b, 0x9c, 0x43, + 0xd1, 0xff, 0xed, 0x17, 0xd0, 0xff, 0xad, 0x13, 0x2f, 0x00, + 0x52, 0xec, 0x9c, 0xda, 0x41, 0x4d, 0xaa, 0x56, 0xab, 0x95, + 0x5e, 0x54, 0xb2, 0xae, 0x30, 0xcd, 0x2c, 0x5b, 0x12, 0x29, + 0xe0, 0xe4, 0x02, 0x41, 0xbb, 0x65, 0x02, 0x49, 0xb9, 0x64, + 0xfd, 0xb6, 0x46, 0x9b, 0x35, 0xf0, 0xb1, 0x56, 0xa9, 0x8e, + 0x45, 0xa5, 0xe7, 0x9e, 0xf1, 0xc1, 0x65, 0xa3, 0x38, 0x96, + 0x73, 0x29, 0xa8, 0x51, 0x54, 0x77, 0x31, 0x4a, 0x65, 0x77, + 0xb1, 0x43, 0x9a, 0x88, 0x4e, 0xbc, 0x32, 0x24, 0x99, 0x19, + 0xd4, 0x08, 0x13, 0x2f, 0xbf, 0x5b, 0x34, 0xfb, 0x2f, 0x93, + 0xd1, 0x9b, 0xdc, 0x70, 0x6e, 0x19, 0x56, 0x53, 0x44, 0xf3, + 0x5e, 0x53, 0x44, 0x7b, 0xa1, 0xac, 0xbb, 0x84, 0x08, 0xd9, + 0x8e, 0x10, 0xb1, 0x88, 0x4e, 0x33, 0x86, 0xdd, 0xc1, 0xbc, + 0x0a, 0x83, 0x84, 0x68, 0x75, 0x6f, 0x67, 0x77, 0x44, 0x85, + 0x07, 0x76, 0x44, 0x8f, 0x07, 0x77, 0xbb, 0x70, 0xf8, 0x88, + 0x83, 0x18, 0x11, 0xdb, 0xf2, 0xea, 0x5e, 0x78, 0x9d, 0xb4, + 0xa7, 0xd0, 0x77, 0x32, 0xfd, 0xb2, 0x90, 0xad, 0xb7, 0xdf, + 0x30, 0x02, 0xff, 0xd0, 0x20, 0x00, 0xff, 0xd0, 0xcf, 0xff, + 0x00, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x80, 0xde, 0x9b, 0x24, 0x79, 0xde, + 0x52, 0x83, 0xf4, 0x1f, 0x16, 0x0a, 0xa5, 0xf0, 0x7a, 0xfc, + 0xd9, 0x0d, 0x5f, 0x0d, 0xf9, 0x9a, 0x33, 0x4b, 0xf9, 0x9a, + 0x33, 0x09, 0x06, 0x65, 0xcc, 0xf6, 0x35, 0x08, 0x23, 0xaa, + 0x06, 0xb1, 0x5b, 0x50, 0xdf, 0xa4, 0xa3, 0xcd, 0xc0, 0x08, + 0xe6, 0x85, 0xcd, 0xb1, 0x2d, 0x1a, 0xdf, 0x32, 0xe6, 0x11, + 0xcf, 0x30, 0xa7, 0x11, 0x30, 0xcf, 0x58, 0xee, 0x4c, 0xad, + 0x9a, 0x46, 0xe1, 0x56, 0x69, 0x88, 0x9a, 0xbe, 0xe3, 0x93, + 0xa5, 0x8d, 0x59, 0x33, 0x0a, 0x86, 0x13, 0x46, 0x08, 0xa6, + 0x93, 0x86, 0x08, 0xa6, 0x93, 0x06, 0xf7, 0x59, 0x6c, 0xf9, + 0xc0, 0x96, 0x57, 0x1e, 0xef, 0xc6, 0x33, 0x25, 0x5d, 0x30, + 0x18, 0x77, 0x04, 0x7f, 0x63, 0x76, 0xc3, 0x81, 0x5d, 0x55, + 0xdf, 0xe5, 0xda, 0x74, 0xd7, 0xe5, 0xda, 0x74, 0x28, 0x1a, + 0x25, 0x8b, 0x58, 0xa9, 0xfa, 0x1c, 0xd9, 0xb2, 0xaa, 0xa8, + 0x52, 0x74, 0xba, 0x35, 0x97, 0xc3, 0x11, 0x61, 0x0d, 0x89, + 0x5b, 0x70, 0x11, 0xed, 0x57, 0x73, 0x15, 0xed, 0x53, 0x71, + 0xea, 0x12, 0xac, 0x8e, 0xaf, 0x5b, 0xae, 0xc5, 0x6d, 0x5b, + 0x68, 0xc1, 0x77, 0x6d, 0xba, 0x91, 0xfb, 0xd0, 0x12, 0x23, + 0xbe, 0x02, 0x12, 0xfc, 0x2a, 0xb2, 0x88, 0xfc, 0xaa, 0x92, + 0x98, 0xfc, 0x55, 0x6d, 0x67, 0x03, 0xaf, 0x1b, 0xbf, 0x3e, + 0xe4, 0x3c, 0x17, 0x8b, 0x4d, 0x1f, 0x23, 0xa6, 0x59, 0x00, + 0x3b, 0x4e, 0x54, 0x67, 0xe3, 0xc9, 0x5d, 0x46, 0x01, 0x4b, + 0x5d, 0x44, 0x21, 0x4b, 0xa2, 0xbb, 0xde, 0xb4, 0xcf, 0x48, + 0x2a, 0x76, 0x2d, 0xb7, 0x31, 0x7f, 0xc7, 0x35, 0xb1, 0x88, + 0x94, 0x5f, 0xa7, 0xe9, 0xde, 0xb7, 0xbf, 0x5a, 0x16, 0x37, + 0xb7, 0xdd, 0x96, 0x37, 0xb7, 0xd9, 0x69, 0xc8, 0x48, 0x26, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x35, 0x0b, 0x81, 0x6e, 0x7b, 0xa4, 0x51, 0x57, + 0x5a, 0xb1, 
0xff, 0xd2, 0x53, 0x0a, 0xdc, 0xfc, 0x94, 0x89, + 0x24, 0x1a, 0x00, 0x9e, 0x78, 0x3b, 0x10, 0x9b, 0x38, 0x3b, + 0xef, 0x64, 0xc7, 0xc4, 0xa4, 0x4f, 0xee, 0x1a, 0x58, 0xb4, + 0x30, 0x49, 0x03, 0xc1, 0x50, 0xdc, 0xb1, 0xec, 0x5b, 0x94, + 0x83, 0x06, 0x8e, 0x2c, 0x43, 0x84, 0xd9, 0xac, 0x03, 0x84, + 0xda, 0xac, 0xfc, 0x7b, 0x25, 0x53, 0x0c, 0x64, 0x40, 0xb9, + 0x49, 0x48, 0x41, 0x1d, 0xca, 0x4a, 0x67, 0x57, 0x1c, 0x2e, + 0x2b, 0x2d, 0xb4, 0x15, 0x0b, 0xb5, 0x7c, 0x80, 0x96, 0x9d, + 0x7c, 0x80, 0x9a, 0x9d, 0x83, 0x7f, 0x65, 0x62, 0x57, 0x29, + 0x49, 0x40, 0xbf, 0x1a, 0xbc, 0xa9, 0xbb, 0xa4, 0xb0, 0xa9, + 0xe1, 0x39, 0x3b, 0xd6, 0xc7, 0x04, 0x4f, 0xc1, 0x41, 0x2e, + 0x2f, 0x85, 0x41, 0x2c, 0x2f, 0xc5, 0xbe, 0xd3, 0xd0, 0x3a, + 0xbd, 0x99, 0x11, 0x63, 0xd0, 0xbd, 0x9f, 0xf4, 0xda, 0x71, + 0xf1, 0xbd, 0xae, 0xc2, 0x1f, 0xf7, 0x4e, 0x12, 0x29, 0x54, + 0x20, 0x52, 0x79, 0xde, 0x28, 0x52, 0x79, 0xd6, 0xd7, 0xad, + 0x86, 0x29, 0x3d, 0xe0, 0xe2, 0x58, 0xe4, 0xb3, 0x40, 0x94, + 0x24, 0xe9, 0x10, 0xea, 0x70, 0x36, 0x37, 0x84, 0x37, 0xf6, + 0xd6, 0x32, 0x35, 0xfc, 0x0a, 0xba, 0x35, 0xfc, 0x1a, 0xba, + 0xca, 0x03, 0xe5, 0x45, 0x5c, 0xd7, 0x39, 0x71, 0x09, 0x3b, + 0x31, 0x93, 0x18, 0x99, 0x9b, 0x43, 0x33, 0x39, 0x62, 0x7e, + 0xd9, 0x02, 0x22, 0xed, 0xd9, 0x08, 0x2a, 0xfe, 0xd9, 0x08, + 0x22, 0xff, 0x26, 0xf7, 0xdd, 0x00, 0xf8, 0x80, 0xe6, 0x7a, + 0x12, 0xe5, 0x3b, 0x51, 0x04, 0xf9, 0xcc, 0x3f, 0xcb, 0x42, + 0xeb, 0x28, 0xae, 0xee, 0x02, 0xd5, 0x0d, 0x18, 0x27, 0x15, + 0x0f, 0x18, 0x26, 0x15, 0xf0, 0xe7, 0xd9, 0xea, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x4b, 0x1d, 0x31, 0xeb, 0x9e, 0xea, 0x2f, 0x38, 0x01, 0x30, + 0xbe, 0x26, 0x94, 0x22, 0x25, 0x73, 0x29, 0x04, 0xe7, 0x8a, + 0x85, 0x80, 0x77, 0xfe, 0x84, 0x80, 0x77, 0xfe, 0x7b, 0x7f, + 0x88, 0x01, 0x9d, 0x4e, 0xd9, 0xec, 0xa2, 0xd8, 0xd5, 0xa9, + 0x8d, 0x42, 0x0f, 0x5e, 
0x6f, 0xc6, 0xee, 0xfe, 0xca, 0x2c, + 0xd6, 0x5f, 0x66, 0xcd, 0xcf, 0xf2, 0x66, 0x8d, 0xcf, 0xde, + 0x99, 0x72, 0x30, 0x21, 0xc2, 0xfb, 0xea, 0x85, 0xb3, 0x92, + 0xf2, 0xf9, 0xa3, 0xc8, 0x45, 0xd3, 0x7d, 0x19, 0xf6, 0xcd, + 0xd4, 0xc3, 0x45, 0xe7, 0x54, 0xe3, 0x47, 0xdf, 0x54, 0xc3, + 0x47, 0xdf, 0xab, 0x3c, 0xb8, 0x20, 0x21, 0x36, 0xfa, 0xa6, + 0x35, 0x15, 0x0e, 0xf0, 0x28, 0x44, 0xba, 0x93, 0x70, 0xc8, + 0x9e, 0xaf, 0x3c, 0x25, 0x31, 0x0c, 0xdc, 0x55, 0xf0, 0xee, + 0x1c, 0x45, 0xf0, 0xee, 0xe3, 0xba, 0x0f, 0x11, 0xe0, 0x28, + 0xf4, 0x7e, 0xe5, 0x04, 0xa8, 0xe5, 0x5e, 0x48, 0x1b, 0xc0, + 0x8e, 0x17, 0x05, 0xa3, 0x2d, 0xd0, 0x3c, 0xf0, 0x6d, 0x89, + 0xb4, 0xe0, 0x6d, 0x88, 0xb4, 0xe0, 0x92, 0x77, 0x4b, 0x1f, + 0xc2, 0x0f, 0x82, 0x73, 0xb7, 0x27, 0x88, 0x2e, 0x27, 0x3e, + 0x89, 0xfb, 0x61, 0x37, 0xdd, 0x66, 0xb7, 0x19, 0x1e, 0x36, + 0x03, 0x59, 0x1c, 0xee, 0x23, 0x19, 0x1c, 0xa6, 0xdc, 0xe6, + 0xe3, 0x59, 0xa0, 0x0c, 0x00, 0xd8, 0x55, 0xd9, 0x56, 0x47, + 0x6e, 0x8b, 0xc9, 0xb5, 0xd7, 0x39, 0xfb, 0x1a, 0x4d, 0xbc, + 0xe5, 0x65, 0x4c, 0x53, 0xc5, 0x60, 0x4c, 0x19, 0xe5, 0x60, + 0xb3, 0xe6, 0x1a, 0x9f, 0xf7, 0x25, 0x77, 0x24, 0x51, 0x25, + 0x46, 0x11, 0x38, 0x62, 0xc7, 0x6b, 0x52, 0x9a, 0x56, 0x9b, + 0x22, 0x56, 0xfa, 0xd4, 0xaa, 0xb2, 0x53, 0x0d, 0x2a, 0xb2, + 0x53, 0x0d, 0xd5, 0x4d, 0xac, 0xf2, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x64, + 0x3e, 0x18, 0x3b, 0xcb, 0xeb, 0xf2, 0x02, 0x44, 0x7b, 0x05, + 0x15, 0x20, 0x2e, 0x45, 0x8f, 0x10, 0x33, 0xb6, 0x83, 0xe0, + 0xb2, 0x9d, 0x83, 0xe0, 0xb2, 0x9d, 0x7c, 0x1f, 0x4d, 0x62, + 0x8e, 0x8a, 0x04, 0x60, 0x17, 0x58, 0xc2, 0x72, 0x0f, 0x5a, + 0x00, 0x4e, 0xcc, 0xc9, 0x4f, 0xbf, 0xda, 0x59, 0x02, 0x39, + 0x4b, 0xd5, 0x76, 0x29, 0x4b, 0xd1, 0x56, 0x29, 0xb4, 0x2e, + 0xa9, 0xd6, 0x9f, 0x51, 0x50, 0xd6, 0xf7, 0x24, 0x06, 0xa5, + 0x0f, 0x2a, 0x25, 0xe3, 0x7b, 0xf1, 
0x23, 0xb6, 0x2e, 0xa1, + 0xf2, 0xeb, 0xbe, 0xed, 0x60, 0xcf, 0xbe, 0xed, 0x60, 0x8f, + 0x41, 0x12, 0x9f, 0x70, 0x4a, 0xb0, 0x50, 0x89, 0xe3, 0xe3, + 0xd0, 0x22, 0x7e, 0x8f, 0x69, 0x3e, 0x35, 0x72, 0x52, 0xf7, + 0x35, 0x03, 0xed, 0xbf, 0xb5, 0x02, 0xcc, 0xbf, 0x35, 0x02, + 0xcc, 0xbf, 0xca, 0xfd, 0x33, 0x40, 0xb2, 0x0d, 0x23, 0xcc, + 0xf6, 0xcf, 0x89, 0xb3, 0x07, 0x63, 0x92, 0xad, 0x47, 0x3d, + 0x7d, 0xd3, 0xb3, 0x7e, 0xf0, 0x2c, 0xa7, 0x93, 0xe7, 0x98, + 0xa7, 0xbf, 0xf4, 0x89, 0x58, 0x40, 0x0b, 0x76, 0xda, 0x21, + 0x50, 0xb3, 0x08, 0x36, 0xcb, 0x8d, 0xd7, 0x74, 0xec, 0xaf, + 0x56, 0x85, 0x2c, 0xf6, 0x28, 0xab, 0xbf, 0xf8, 0xf4, 0x09, + 0x8b, 0xe2, 0xf4, 0x09, 0xaf, 0xe2, 0x0b, 0xf6, 0x50, 0x1d, + 0x56, 0x3e, 0xd4, 0x03, 0x57, 0x23, 0xf4, 0xa3, 0x10, 0xb0, + 0x8d, 0x06, 0x19, 0x69, 0x35, 0x6f, 0xfa, 0xc8, 0x37, 0x69, + 0x3f, 0x5d, 0x35, 0x6c, 0x3f, 0x88, 0x35, 0x6d, 0xc0, 0x77, + 0xca, 0x92, 0xc8, 0x3f, 0xce, 0x0c, 0xba, 0x1f, 0x44, 0xb7, + 0x68, 0xfb, 0xb8, 0x4a, 0xfa, 0x9a, 0x94, 0x72, 0x98, 0x40, + 0x89, 0xe2, 0x4f, 0x55, 0x98, 0xc2, 0xcf, 0x53, 0x98, 0xc2, + 0x30, 0xac, 0x67, 0x3d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x43, 0x91, 0x3d, + 0xe4, 0x75, 0xe0, 0xb3, 0x42, 0x9e, 0x5c, 0x75, 0x28, 0x41, + 0x25, 0xfc, 0x3e, 0xa4, 0xc7, 0x67, 0x23, 0xc8, 0x44, 0x67, + 0x23, 0xc8, 0x44, 0x67, 0xdc, 0x37, 0xbb, 0x98, 0x18, 0xb7, + 0x3f, 0x2f, 0x0c, 0x34, 0x2e, 0xc8, 0x9f, 0x42, 0xa1, 0xc0, + 0x5b, 0x8a, 0xb0, 0x48, 0xc0, 0xbe, 0x45, 0x80, 0x4c, 0x3e, + 0xa0, 0x44, 0x4c, 0x3e, 0xa0, 0x40, 0xb3, 0xc1, 0x5f, 0xbf, + 0xb8, 0x25, 0xf4, 0xa7, 0x68, 0xb3, 0x3f, 0x24, 0x90, 0xf4, + 0xf8, 0x21, 0x7f, 0x86, 0x5d, 0xa6, 0xb4, 0xc9, 0x2b, 0x64, + 0xb9, 0xd0, 0xed, 0xac, 0xb9, 0xd0, 0xe9, 0x2c, 0x46, 0x2f, + 0x16, 0xd3, 0xd3, 0x0d, 0xa5, 0xf5, 0xb9, 0x05, 0xea, 0x11, + 0xfb, 0x92, 0xf6, 0x73, 0xc8, 0xea, 0xcd, 0xe8, 
0x0e, 0xe0, + 0x65, 0x8f, 0xfc, 0x61, 0x14, 0xcc, 0xfc, 0x61, 0x54, 0xcc, + 0x03, 0x9e, 0xab, 0x33, 0xda, 0x88, 0xe9, 0xf2, 0xfd, 0xc8, + 0x19, 0x34, 0xde, 0x55, 0x52, 0x7b, 0xe8, 0xbe, 0xd3, 0xf8, + 0xa4, 0x55, 0x24, 0xc4, 0xe9, 0x85, 0xe7, 0xf9, 0xe9, 0x05, + 0x67, 0xfd, 0x16, 0xfa, 0x98, 0x02, 0xdd, 0xa3, 0x23, 0xec, + 0xd5, 0xf7, 0x75, 0x13, 0x5b, 0x8d, 0x54, 0x02, 0x6c, 0xc2, + 0x1e, 0x59, 0xc0, 0xfb, 0x5b, 0xc6, 0xf9, 0x5a, 0x77, 0xad, + 0xf8, 0xda, 0x7f, 0xe9, 0x07, 0x25, 0x80, 0x16, 0xaa, 0x2f, + 0xe7, 0xf3, 0x3b, 0x7d, 0x9d, 0x37, 0x97, 0x81, 0xa0, 0xa8, + 0x10, 0x43, 0xe4, 0x3a, 0x4f, 0xed, 0x38, 0xf2, 0x87, 0xe5, + 0x27, 0x36, 0x87, 0xe5, 0x25, 0x32, 0x78, 0x1a, 0xda, 0xcd, + 0x5d, 0x74, 0xe6, 0xf6, 0x2a, 0x7b, 0xd2, 0x57, 0x76, 0x48, + 0xe5, 0x26, 0x6b, 0x86, 0x70, 0x86, 0x56, 0xe3, 0x6a, 0xd7, + 0x12, 0x60, 0xce, 0x26, 0x16, 0x60, 0xea, 0x26, 0xe9, 0x9f, + 0x15, 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x39, 0xda, 0x11, 0x7f, 0x2c, 0x42, + 0xe4, 0x4b, 0xb8, 0xe0, 0x90, 0xd0, 0xbb, 0x18, 0x04, 0xc2, + 0x7b, 0x8f, 0x86, 0x4d, 0xa5, 0x8d, 0xa6, 0x50, 0xa9, 0x8d, + 0x86, 0x40, 0x56, 0x72, 0x79, 0xbf, 0xe5, 0x18, 0x44, 0x8b, + 0x1d, 0x18, 0xd4, 0x81, 0xa8, 0xed, 0xc3, 0x76, 0x96, 0x3b, + 0xd8, 0x31, 0x74, 0x15, 0x2c, 0x58, 0x46, 0xf5, 0x3e, 0x55, + 0x46, 0x55, 0x3c, 0x55, 0xb9, 0xaa, 0xc3, 0xaa, 0xcf, 0xa1, + 0xf7, 0x76, 0x8d, 0x9c, 0xef, 0x9a, 0xde, 0x2b, 0x7a, 0x49, + 0xc1, 0xc1, 0x38, 0xc8, 0x63, 0x45, 0x42, 0xeb, 0x67, 0x8d, + 0xb7, 0x79, 0x67, 0x85, 0x32, 0x79, 0x98, 0x7a, 0xcd, 0x86, + 0x05, 0xc0, 0xc6, 0xa4, 0xd0, 0xb0, 0x70, 0xd0, 0xed, 0x54, + 0xea, 0x94, 0xf4, 0xaa, 0x1d, 0x75, 0x91, 0x86, 0x3e, 0x6e, + 0x1c, 0xeb, 0x0e, 0x6e, 0x94, 0xeb, 0x2e, 0x6e, 0x6b, 0x14, + 0xd1, 0x91, 0x6a, 0xde, 0x6a, 0xcb, 0x82, 0x57, 0xf1, 0x95, + 0x49, 0xff, 0xe4, 0xf4, 0xd4, 0x20, 0xcc, 0xbd, 0x2c, 0xa4, + 
0x80, 0x77, 0x05, 0x0c, 0xdb, 0xbe, 0x05, 0x0c, 0xd9, 0x3e, + 0xfa, 0xf3, 0x26, 0xc1, 0x9f, 0x92, 0x00, 0x8b, 0x5c, 0xeb, + 0x6a, 0xd2, 0x90, 0xab, 0x56, 0x2a, 0xf7, 0xc0, 0x64, 0x11, + 0xc6, 0xbc, 0x97, 0x60, 0x90, 0xbc, 0x54, 0x40, 0x90, 0xbc, + 0x54, 0x40, 0x6f, 0x43, 0xab, 0xbf, 0x8a, 0xc4, 0xce, 0x3f, + 0xae, 0x5b, 0xc5, 0x96, 0x94, 0xa0, 0x67, 0xa2, 0xc7, 0x75, + 0x8d, 0x84, 0x30, 0xdb, 0xa1, 0x8b, 0x4e, 0xfb, 0xe1, 0xd8, + 0x4e, 0xfb, 0xa1, 0x88, 0xb1, 0x04, 0x5e, 0x77, 0x8a, 0x55, + 0xbf, 0xfb, 0x04, 0xfe, 0x21, 0x99, 0x89, 0xbe, 0x4a, 0x99, + 0xb5, 0x42, 0x06, 0x84, 0xbc, 0xe0, 0xf0, 0xfc, 0x85, 0x00, + 0x05, 0xc3, 0x85, 0x00, 0x04, 0xc3, 0x7a, 0xff, 0xfb, 0x3c, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0xda, 0x8d, 0x48, 0xe1, 0x99, 0x28, 0x49, + 0x90, 0x69, 0x60, 0xbe, 0xc5, 0x88, 0x99, 0x9a, 0x1f, 0x3a, + 0x78, 0xf0, 0xcb, 0x31, 0x68, 0xc3, 0xcb, 0x31, 0x68, 0xc1, + 0x34, 0xce, 0x97, 0x3e, 0x34, 0xff, 0xe7, 0x3a, 0xe6, 0xe1, + 0xff, 0xe7, 0xaf, 0x75, 0x3d, 0xb9, 0x2a, 0x06, 0x5b, 0xae, + 0x87, 0x59, 0x16, 0xd3, 0xc3, 0x4c, 0x37, 0x53, 0xc3, 0x4c, + 0x37, 0xd3, 0x3c, 0xb3, 0xc8, 0x2c, 0x08, 0xdd, 0x74, 0x06, + 0x33, 0xf9, 0x61, 0xcc, 0x7c, 0x79, 0xfd, 0x3a, 0x6d, 0x4a, + 0x40, 0xb5, 0xea, 0xf0, 0x78, 0x98, 0xec, 0xda, 0x3d, 0xaa, + 0xec, 0xda, 0x3d, 0xaa, 0x13, 0x25, 0xc2, 0x55, 0xf2, 0x87, + 0x14, 0x8b, 0xb6, 0x29, 0x44, 0xe1, 0xb1, 0x10, 0x9b, 0x13, + 0x2f, 0xae, 0xb6, 0x63, 0x8f, 0x8c, 0xce, 0xab, 0x4e, 0x0f, + 0x12, 0xab, 0x4f, 0x0c, 0x92, 0xab, 0xb0, 0xf3, 0x6d, 0x54, + 0x55, 0x95, 0x9b, 0x2a, 0xf5, 0x17, 0xd0, 0x79, 0x5c, 0xc5, + 0x3a, 0xbe, 0xae, 0xf6, 0xb2, 0x68, 0x0a, 0x20, 0xc8, 0xd6, + 0x08, 0xc2, 0xcb, 0x68, 0x0a, 0xc0, 0xcb, 0x68, 0xf5, 0x3f, + 0x34, 0x97, 0x4c, 0x52, 0x17, 0x9b, 0x91, 0xcb, 0x84, 0x19, + 0x7e, 0x61, 0x1a, 0x45, 0x0b, 0x1f, 0x6c, 0x8a, 0xb4, 0x03, + 0xb9, 0x8f, 
0x95, 0xa3, 0xba, 0x8d, 0x95, 0x23, 0xba, 0x8d, + 0x6a, 0xdc, 0x45, 0x72, 0x0d, 0xf0, 0x5b, 0x74, 0xe2, 0x5a, + 0x5b, 0xd0, 0x7b, 0x9b, 0x64, 0xe8, 0xb5, 0xbf, 0x72, 0x25, + 0xba, 0xd7, 0x2a, 0x2c, 0xbf, 0xa5, 0xc1, 0xfc, 0xbf, 0xa7, + 0x40, 0x7c, 0x40, 0x58, 0xbf, 0x83, 0x00, 0xbc, 0xc0, 0xc5, + 0xfc, 0xcc, 0x87, 0xf6, 0x82, 0xdb, 0x7f, 0x04, 0x38, 0xbe, + 0x87, 0xee, 0xae, 0x59, 0xc0, 0x5a, 0x3c, 0x5d, 0xc8, 0x56, + 0x3c, 0x5d, 0xc8, 0x5e, 0xc3, 0xa2, 0x37, 0xa1, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xc1, 0x7c, 0xa3, 0x86, 0x61, 0x37, 0x8d, 0xb0, 0xf0, 0xb9, + 0x6b, 0x2c, 0x31, 0xfa, 0xdf, 0xe6, 0x75, 0xbd, 0x0e, 0x31, + 0xb9, 0xb9, 0x9b, 0x20, 0xf1, 0xb9, 0x9f, 0x20, 0x0e, 0x46, + 0x60, 0xdf, 0xd2, 0x5e, 0xd3, 0xa5, 0xd5, 0xf2, 0xc8, 0xef, + 0xb3, 0x18, 0x09, 0x91, 0xaf, 0x4b, 0x40, 0x3b, 0x7a, 0xd1, + 0x73, 0xfb, 0x6e, 0x19, 0xf5, 0xe1, 0x7e, 0x19, 0xf5, 0xe1, + 0x81, 0xe6, 0x0a, 0x1e, 0x2a, 0x1e, 0x4e, 0x81, 0x9c, 0xf4, + 0x27, 0x36, 0x19, 0xba, 0xea, 0x28, 0x92, 0x1e, 0x34, 0x6b, + 0xfb, 0x5b, 0x58, 0x2e, 0x62, 0xaa, 0x3a, 0x34, 0x62, 0xaa, + 0x38, 0x34, 0x9d, 0x55, 0xc7, 0xcb, 0x89, 0x87, 0xa7, 0x3b, + 0xd6, 0x28, 0x54, 0xee, 0x3d, 0x26, 0x3d, 0x67, 0x47, 0x88, + 0xe6, 0x9a, 0xf4, 0x5a, 0x1a, 0x10, 0x27, 0xc0, 0x16, 0x9a, + 0x27, 0x48, 0x16, 0x9a, 0xd8, 0xb7, 0xe9, 0x65, 0x82, 0x75, + 0x6c, 0xad, 0xd8, 0xfe, 0x32, 0x42, 0x39, 0x26, 0x17, 0x9b, + 0x35, 0xf4, 0x6d, 0x26, 0x45, 0x9f, 0x6b, 0x5c, 0x50, 0x1d, + 0x22, 0x55, 0x50, 0x1d, 0x62, 0x45, 0xaf, 0xe2, 0x9d, 0xba, + 0x1d, 0x4e, 0xf8, 0xcb, 0x37, 0x49, 0x44, 0xcd, 0x29, 0xe3, + 0xf8, 0xfd, 0xa3, 0x23, 0x64, 0x27, 0x05, 0x49, 0xee, 0x3e, + 0xa5, 0x26, 0xec, 0x5f, 0xa5, 0x26, 0xec, 0x5f, 0x5a, 0xd9, + 0x13, 0xa0, 0xcb, 0xf5, 0x5a, 0xd2, 0x63, 0xec, 0xfb, 0xba, + 0x79, 0x40, 0x19, 0x76, 0xcc, 0xed, 0x49, 0x94, 0x2f, 0x43, + 0x01, 0x61, 0xa5, 0x45, 
0x27, 0x76, 0xa5, 0x45, 0x07, 0x72, + 0x5a, 0xba, 0xf8, 0x8d, 0xdc, 0x78, 0x48, 0xb6, 0x79, 0xf7, + 0x3f, 0x87, 0x1c, 0xb1, 0x2c, 0xc6, 0xef, 0x6c, 0x62, 0x37, + 0x05, 0x08, 0xe7, 0xcc, 0xe5, 0x08, 0xae, 0x81, 0xc5, 0x08, + 0xae, 0x84, 0x3a, 0xf7, 0x51, 0x7b, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6a, 0x83, + 0xce, 0x4a, 0xa5, 0x2a, 0xf8, 0xbb, 0x6d, 0x45, 0x91, 0x4d, + 0x62, 0x8a, 0xd2, 0xb0, 0x02, 0xbc, 0x85, 0xc4, 0x53, 0x0a, + 0x16, 0x59, 0x53, 0x08, 0x96, 0x59, 0xac, 0xf7, 0x69, 0xa6, + 0xe9, 0x7e, 0xd1, 0x6c, 0x31, 0x53, 0x3f, 0xf9, 0xdd, 0x63, + 0x8c, 0x31, 0x2f, 0xb4, 0xaf, 0xe2, 0x85, 0x43, 0x81, 0x3e, + 0xd5, 0x66, 0x29, 0x6a, 0xd5, 0x66, 0x29, 0x6a, 0x2a, 0x99, + 0xd6, 0x95, 0x01, 0xea, 0x7a, 0x41, 0x90, 0x79, 0xdd, 0x28, + 0x88, 0x12, 0x58, 0x28, 0x7f, 0x3c, 0x30, 0x76, 0xdd, 0x22, + 0x71, 0x46, 0x5a, 0x22, 0xb3, 0x74, 0x5e, 0x22, 0x31, 0x64, + 0xa1, 0xdd, 0xce, 0x9b, 0xc9, 0x34, 0x49, 0xe9, 0x23, 0x17, + 0xa7, 0xe5, 0xe2, 0x03, 0x60, 0xb3, 0x65, 0x24, 0x24, 0xf8, + 0x79, 0xeb, 0xa9, 0xf0, 0x08, 0xe3, 0x05, 0xfa, 0x48, 0xe3, + 0x05, 0xf8, 0xb7, 0x1c, 0xfa, 0x07, 0xb7, 0x5f, 0x5f, 0x26, + 0x52, 0xde, 0x36, 0x18, 0x87, 0x3d, 0x4d, 0xc4, 0x84, 0x61, + 0x2e, 0x5b, 0xfa, 0x02, 0x43, 0xae, 0x1e, 0xa7, 0x0a, 0xa6, + 0x9e, 0xa7, 0x0a, 0xa4, 0x61, 0x58, 0xf5, 0x5b, 0xce, 0x40, + 0xf0, 0xe3, 0x30, 0xb1, 0x0d, 0xca, 0xdc, 0x63, 0x69, 0x1a, + 0xf1, 0x3c, 0x88, 0x5a, 0xd1, 0xd1, 0xde, 0x33, 0xa0, 0x71, + 0xac, 0x82, 0xb0, 0x71, 0x8c, 0x82, 0x4f, 0x8e, 0x73, 0x7d, + 0x15, 0x7e, 0x14, 0x61, 0x32, 0x62, 0x14, 0x08, 0x5b, 0x91, + 0x64, 0x3b, 0xdc, 0x7c, 0x64, 0x6c, 0x09, 0x92, 0xf0, 0xda, + 0x2d, 0x56, 0x64, 0xfc, 0x0d, 0x56, 0x66, 0xfc, 0xf2, 0xa9, + 0x9b, 0x03, 0x1e, 0x93, 0xf3, 0x4c, 0xd4, 0x2c, 0x98, 0xa2, + 0x8f, 0xfd, 0x9a, 0xd3, 0xc8, 0xe7, 0x9f, 0x9e, 0xb1, 0x14, + 0x6b, 0x98, 0x21, 0x14, 0x0b, 0x08, 
0x31, 0x14, 0x0b, 0x08, + 0xce, 0xeb, 0xf4, 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb2, 0x73, 0xc5, 0x06, + 0xe6, 0xce, 0xd2, 0xe7, 0xd5, 0x7d, 0xf1, 0x55, 0xcc, 0xd9, + 0xf2, 0x91, 0xeb, 0x6e, 0x04, 0x02, 0x80, 0x64, 0xf3, 0x52, + 0xc0, 0x64, 0xf3, 0x12, 0x3f, 0x9b, 0x0c, 0xed, 0x03, 0x29, + 0xfb, 0x5d, 0xa6, 0x30, 0xd7, 0x96, 0xb2, 0xa5, 0xfa, 0x9d, + 0x80, 0x2c, 0xef, 0xed, 0xee, 0x93, 0x9c, 0x89, 0xa6, 0xf7, + 0x95, 0x9b, 0xa6, 0xb7, 0x95, 0x8b, 0x59, 0x48, 0x6a, 0x74, + 0x89, 0x28, 0x26, 0xa3, 0x73, 0xd3, 0xf4, 0xd6, 0x55, 0xce, + 0x4b, 0xcb, 0x6a, 0x2e, 0x64, 0x64, 0x16, 0x4e, 0xb1, 0xd0, + 0xb6, 0xed, 0xba, 0xdc, 0xb6, 0xee, 0xba, 0xdc, 0x49, 0x11, + 0x45, 0x23, 0x4c, 0x0c, 0x8e, 0x70, 0x70, 0x4a, 0x53, 0x5e, + 0x52, 0xeb, 0x38, 0xf2, 0xb9, 0x4f, 0x91, 0xee, 0x4a, 0xff, + 0x4a, 0xa8, 0x8c, 0xee, 0x80, 0xea, 0x0c, 0xee, 0x80, 0xea, + 0xf3, 0x11, 0x7f, 0x15, 0xbd, 0x66, 0xe5, 0x4a, 0xf7, 0x6d, + 0xa2, 0xbe, 0xfe, 0x87, 0xc1, 0xc9, 0x24, 0xa4, 0x7e, 0xb3, + 0x68, 0x06, 0xda, 0xad, 0xa0, 0x17, 0x52, 0xa9, 0xa4, 0x06, + 0x52, 0xa9, 0x5b, 0xf9, 0xad, 0x56, 0xad, 0xe7, 0x3e, 0x5e, + 0x0b, 0x9c, 0xcf, 0x74, 0xb4, 0x32, 0x8e, 0x88, 0xb6, 0x76, + 0x89, 0xf5, 0x85, 0x7f, 0x0a, 0x0f, 0x14, 0xb2, 0x29, 0x5f, + 0x14, 0xb6, 0x29, 0x5f, 0xeb, 0x49, 0xd6, 0xa0, 0x1f, 0x48, + 0x4b, 0xac, 0x49, 0x5b, 0xcb, 0x12, 0x6d, 0xc2, 0xda, 0x96, + 0x7c, 0xda, 0xfb, 0xbe, 0xd6, 0xf8, 0xaa, 0xfe, 0x53, 0x7a, + 0xe2, 0xfe, 0x5b, 0xfa, 0xea, 0xfe, 0xa4, 0x05, 0x15, 0x01, + 0x9a, 0xc6, 0xf6, 0x49, 0x9c, 0x36, 0x7c, 0xe7, 0xc6, 0x33, + 0x73, 0xd3, 0xf4, 0x03, 0xe3, 0x48, 0x56, 0x5d, 0x55, 0x1f, + 0x54, 0x13, 0x8f, 0xbe, 0x54, 0x13, 0x87, 0x9e, 0xab, 0xec, + 0x78, 0x61, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 
0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x40, 0x6a, 0xf8, 0x06, 0xc9, 0xa9, + 0xdd, 0x4b, 0x2f, 0x50, 0x9c, 0x1a, 0xeb, 0xb6, 0x24, 0xa9, + 0x73, 0xa2, 0xb0, 0x68, 0xbd, 0x82, 0xbc, 0x2c, 0xbf, 0x82, + 0xbc, 0x2c, 0x40, 0x7d, 0x43, 0xd3, 0xac, 0xfb, 0x7d, 0x7c, + 0x66, 0x6e, 0xe5, 0x0c, 0x97, 0xc8, 0x05, 0x2b, 0xcd, 0x02, + 0xdf, 0xf0, 0xcc, 0x43, 0x82, 0xbc, 0x89, 0x4a, 0xe7, 0xb1, + 0x8d, 0x4a, 0xc7, 0xb1, 0x72, 0xb5, 0x38, 0x4e, 0x08, 0x40, + 0x79, 0x74, 0x29, 0x9f, 0xbc, 0xb2, 0x4f, 0x81, 0x63, 0x49, + 0x1f, 0xb6, 0x70, 0x5b, 0x88, 0xe2, 0x47, 0x8e, 0xe8, 0xe0, + 0x04, 0xa2, 0xec, 0xe0, 0x04, 0xa2, 0x13, 0x1f, 0xfb, 0x5d, + 0x5e, 0xe2, 0x0d, 0x6e, 0x8f, 0x54, 0x00, 0x39, 0x79, 0x88, + 0x5f, 0x96, 0xa3, 0x30, 0xaa, 0x8b, 0x11, 0x56, 0xba, 0xf5, + 0x20, 0xe6, 0x3a, 0x4e, 0x20, 0xf6, 0xba, 0x4f, 0xdf, 0x09, + 0x45, 0xb0, 0x61, 0x3a, 0x89, 0x85, 0x99, 0x15, 0x41, 0x4b, + 0xea, 0x2f, 0xa1, 0x35, 0xba, 0x23, 0xe2, 0xd8, 0x0a, 0x43, + 0xa9, 0xc6, 0x68, 0x21, 0xa9, 0xd4, 0x6a, 0x23, 0xa9, 0xd0, + 0x95, 0xdc, 0x56, 0x2f, 0x22, 0x82, 0x8d, 0xa1, 0x5f, 0xd8, + 0xc0, 0x9f, 0x61, 0x63, 0x1b, 0xe4, 0x7c, 0x40, 0xa1, 0x32, + 0xbf, 0x0f, 0x79, 0x56, 0x3f, 0x4e, 0x19, 0x22, 0x3f, 0x4e, + 0x19, 0x26, 0xc0, 0xb1, 0xe6, 0xd9, 0x73, 0x1c, 0x55, 0xa0, + 0x61, 0xdd, 0xe3, 0x4b, 0x97, 0xb1, 0xe1, 0x31, 0x26, 0xd5, + 0xfb, 0xaa, 0x86, 0x65, 0x41, 0xab, 0x82, 0xcc, 0xc2, 0x3b, + 0x82, 0xcc, 0x83, 0x3b, 0x7d, 0x33, 0x3c, 0xc4, 0x81, 0xa5, + 0xf5, 0xe1, 0x9a, 0x9d, 0xba, 0x62, 0x10, 0xf7, 0xcd, 0x1f, + 0x63, 0x15, 0x35, 0x79, 0x23, 0x09, 0xf8, 0x18, 0x69, 0x59, + 0xfc, 0x58, 0x63, 0x59, 0xfc, 0x58, 0x9c, 0xa6, 0x03, 0xa7, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x3b, 0x3b, 0xd4, 0x29, 0xf6, 0x00, 0x55, + 0x52, 0xb9, 0x1d, 0x70, 0x46, 0x8d, 0x66, 0xdd, 0x55, 0xab, + 0xa6, 0x7f, 0x50, 0xb1, 0xee, 0xff, 0x10, 0xa1, 0xee, 0xff, + 
0xef, 0x5e, 0x11, 0x00, 0x76, 0x2a, 0x8b, 0x86, 0x12, 0x2c, + 0xb8, 0x77, 0x7a, 0xc5, 0x6d, 0x83, 0x79, 0x58, 0x39, 0x84, + 0x86, 0xc4, 0x99, 0x4e, 0x3c, 0xb4, 0xd4, 0x40, 0x3c, 0xb4, + 0x94, 0x40, 0xc3, 0x4b, 0x6b, 0xbf, 0x01, 0xe4, 0x14, 0x40, + 0x64, 0xa1, 0x03, 0x58, 0xa7, 0x78, 0x65, 0x9b, 0x20, 0xd0, + 0x0e, 0x64, 0xd5, 0xdf, 0x5f, 0x88, 0x65, 0xfe, 0x6b, 0xa5, + 0x65, 0xfe, 0x6f, 0xa5, 0x9a, 0x01, 0x90, 0x5a, 0x44, 0x37, + 0xeb, 0xcf, 0xf0, 0x23, 0x0a, 0x0c, 0x55, 0xce, 0x91, 0x22, + 0xf7, 0xdd, 0xc8, 0x4e, 0xa7, 0xfa, 0x08, 0xaf, 0x9f, 0xd5, + 0x48, 0xd0, 0xb7, 0xdd, 0x48, 0xdd, 0x48, 0x22, 0xb7, 0x22, + 0xf6, 0x4d, 0x6d, 0x15, 0x05, 0xdb, 0x34, 0xb2, 0xa6, 0x73, + 0xa1, 0x01, 0x0c, 0x6e, 0x35, 0xd6, 0x58, 0x18, 0x80, 0x9c, + 0x4a, 0x08, 0x08, 0xb4, 0x48, 0x08, 0x08, 0xb4, 0xb7, 0xf7, + 0xf7, 0x4b, 0xd8, 0x1a, 0xc3, 0x7b, 0xd2, 0x74, 0x64, 0x21, + 0x0b, 0xc6, 0xd6, 0xb0, 0x68, 0x2a, 0xf3, 0xab, 0x74, 0x17, + 0x59, 0x69, 0x7c, 0x12, 0x88, 0xeb, 0x7c, 0x12, 0x98, 0xeb, + 0x83, 0xed, 0x67, 0x14, 0xd8, 0x4e, 0xad, 0x85, 0x03, 0xda, + 0x33, 0x63, 0xf3, 0x94, 0xdb, 0x23, 0xe1, 0x5d, 0x5f, 0x21, + 0x7b, 0xfd, 0x27, 0x1d, 0xec, 0x95, 0x1f, 0x53, 0xf9, 0x95, + 0x1f, 0x43, 0x06, 0x6a, 0xe0, 0xbc, 0x41, 0x9e, 0x3d, 0x77, + 0xc5, 0x9e, 0x5d, 0xba, 0x2b, 0x53, 0x0d, 0xae, 0xda, 0x92, + 0x81, 0x9c, 0x8a, 0xd6, 0x4b, 0xcb, 0xc6, 0x99, 0x46, 0xac, + 0xc6, 0x9d, 0x47, 0xac, 0x39, 0x62, 0xb8, 0x53, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x82, 0x87, 0x45, 0x45, 0x86, 0xdc, 0xb9, 0xe2, 0xca, 0x9b, + 0x84, 0xbb, 0x09, 0x78, 0x1b, 0xcf, 0x21, 0x2d, 0x2f, 0x53, + 0x85, 0xbe, 0x3e, 0xfb, 0x81, 0xbd, 0x3e, 0xfb, 0x7e, 0x42, + 0xc1, 0x04, 0xbb, 0x16, 0xd2, 0x57, 0x65, 0xb9, 0xde, 0x74, + 0x57, 0x17, 0xea, 0x24, 0x58, 0xf8, 0xb5, 0x62, 0x82, 0x5b, + 0x05, 0xbd, 0x97, 0x47, 0x64, 0x21, 0x93, 0x53, 0xe0, 0x21, + 0x6c, 0xac, 
0x1f, 0xde, 0xee, 0x00, 0x50, 0x14, 0x63, 0xc9, + 0x80, 0xb5, 0xb0, 0x6c, 0x69, 0xfb, 0xcb, 0x54, 0xb3, 0x52, + 0x28, 0xa1, 0xde, 0x66, 0x32, 0x71, 0xa0, 0x52, 0x22, 0x71, + 0xa0, 0x52, 0xdd, 0x8e, 0x5f, 0xad, 0xae, 0x05, 0xf3, 0x7e, + 0x6e, 0x99, 0x69, 0xfa, 0x86, 0x04, 0xe6, 0x7f, 0xe1, 0xd1, + 0xdc, 0x00, 0x97, 0xd8, 0x6d, 0x5f, 0xc5, 0x5f, 0x48, 0x5f, + 0x85, 0x5d, 0x48, 0x5f, 0x7a, 0xa2, 0xb7, 0xa0, 0xb9, 0x99, + 0xd9, 0x4d, 0xbe, 0x56, 0xd4, 0xca, 0x76, 0xad, 0xf8, 0x9e, + 0xc1, 0xdd, 0x99, 0x0f, 0x7d, 0xb7, 0x3b, 0xa6, 0x75, 0x3f, + 0x7e, 0xb7, 0x75, 0xbf, 0x7b, 0xb7, 0x8a, 0x40, 0x84, 0x48, + 0x6f, 0x69, 0x1f, 0x7b, 0x05, 0xa6, 0xe5, 0xe7, 0xff, 0x7d, + 0x1a, 0x65, 0x1b, 0x25, 0x26, 0xee, 0xce, 0xb0, 0x18, 0x5a, + 0xde, 0xb1, 0x39, 0xc5, 0xde, 0xb1, 0x1b, 0xcc, 0x21, 0x4e, + 0xe4, 0x33, 0xcd, 0xac, 0x03, 0xb1, 0xef, 0xa5, 0xb5, 0x90, + 0x7b, 0xfb, 0x12, 0xf4, 0x7b, 0x35, 0xf6, 0x33, 0x97, 0xb3, + 0x55, 0x29, 0x61, 0xb3, 0x55, 0x31, 0x61, 0xb1, 0x55, 0x31, + 0x9e, 0x4e, 0xaa, 0xce, 0xf5, 0x1b, 0xc8, 0xed, 0x32, 0x8b, + 0xa8, 0x53, 0x22, 0x96, 0x56, 0x3a, 0x40, 0x49, 0x7e, 0x5e, + 0xb5, 0x20, 0x0f, 0x18, 0x86, 0x50, 0x2a, 0x2a, 0x86, 0x50, + 0x2a, 0x3a, 0x79, 0xaf, 0xd5, 0xc5, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x9d, 0xe1, + 0x7a, 0x57, 0x02, 0xc3, 0x96, 0x40, 0xd8, 0x31, 0x97, 0x0e, + 0x76, 0x69, 0xd4, 0xa8, 0x12, 0x56, 0x5b, 0xad, 0x30, 0x74, + 0x1c, 0x8a, 0x30, 0x74, 0x1d, 0x88, 0xcf, 0x8b, 0xe2, 0x77, + 0x77, 0xa3, 0x56, 0x23, 0x21, 0x03, 0xc4, 0xa9, 0x27, 0xdd, + 0x3d, 0x7e, 0x29, 0xa5, 0xca, 0xc6, 0x84, 0x4a, 0xda, 0x5a, + 0xa4, 0xe4, 0xd6, 0xca, 0xa4, 0xe4, 0xd6, 0xca, 0x5b, 0x1b, + 0x29, 0x35, 0xd3, 0x77, 0x5a, 0x14, 0x6a, 0x59, 0xdc, 0xaf, + 0x98, 0xda, 0x97, 0x10, 0x8b, 0xa1, 0x8c, 0xf0, 0xe1, 0x38, + 0xaf, 0xa4, 0x81, 0xa7, 0xce, 0xe4, 0x81, 0xa6, 0x8e, 0xe4, + 0x7e, 0x59, 0x71, 0x1b, 
0x03, 0x43, 0x26, 0x3e, 0x29, 0xd6, + 0xff, 0x60, 0x8b, 0x94, 0x72, 0x6c, 0x6c, 0xbe, 0x62, 0xfc, + 0x51, 0xa6, 0x3d, 0x62, 0x48, 0xab, 0x2d, 0xfe, 0x48, 0xab, + 0x2d, 0xfe, 0xb7, 0x54, 0xd2, 0x01, 0x2d, 0xa6, 0xdd, 0xf0, + 0x2a, 0x9a, 0xe3, 0x81, 0x8e, 0xe1, 0xf4, 0x42, 0x21, 0x10, + 0x66, 0x6b, 0xdd, 0xb5, 0x8e, 0x4c, 0x0e, 0x51, 0xcf, 0x72, + 0x0c, 0x71, 0xce, 0x72, 0xf3, 0x8e, 0x31, 0x8d, 0x71, 0x03, + 0xba, 0xc5, 0xc6, 0x42, 0xfe, 0xd4, 0xcc, 0x04, 0xb4, 0xd3, + 0x12, 0x99, 0xc5, 0x99, 0x58, 0x45, 0x3f, 0xe8, 0x99, 0xd9, + 0x1e, 0x59, 0x99, 0xcd, 0x9e, 0xd9, 0x66, 0x32, 0x61, 0x26, + 0x55, 0x01, 0xf8, 0xf1, 0xe3, 0x4e, 0xf6, 0x6d, 0x4e, 0x27, + 0xdf, 0x75, 0xb9, 0x69, 0x06, 0x99, 0x70, 0x84, 0x4a, 0x6a, + 0x36, 0xa6, 0x43, 0x6c, 0x34, 0x84, 0x42, 0x6c, 0xcb, 0x7b, + 0xbd, 0x93, 0x9e, 0x8f, 0xcb, 0xd2, 0xf9, 0x3a, 0xe5, 0x16, + 0x75, 0x34, 0x7b, 0xd0, 0x46, 0x4e, 0xfc, 0x82, 0xea, 0xab, + 0x23, 0x01, 0x4a, 0x39, 0x06, 0x02, 0x4a, 0x39, 0x26, 0x02, + 0xb5, 0xc6, 0xd9, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x55, 0xfe, 0x07, 0xbc, + 0x1a, 0xd1, 0x1d, 0x62, 0xb7, 0xcb, 0xfb, 0xf2, 0x3a, 0xc4, + 0xa8, 0xb2, 0x90, 0x25, 0x12, 0x25, 0x9e, 0xb4, 0x92, 0x69, + 0x9e, 0xa5, 0x92, 0x69, 0x61, 0x5a, 0x6d, 0x96, 0x85, 0x2c, + 0xb7, 0x43, 0xbb, 0x58, 0x0f, 0xf6, 0x49, 0x10, 0x36, 0x28, + 0xc0, 0xde, 0xec, 0x29, 0x48, 0x46, 0xe7, 0x3f, 0x0c, 0x57, + 0xa6, 0x3b, 0x08, 0x57, 0xa6, 0x3b, 0xf7, 0xa8, 0x59, 0xc4, + 0x56, 0x88, 0x07, 0x77, 0x0d, 0x50, 0x41, 0x93, 0xa3, 0x6b, + 0x26, 0x85, 0x6d, 0xd8, 0xdd, 0x7b, 0x00, 0xd0, 0xa7, 0x19, + 0x8c, 0xa8, 0xc7, 0x89, 0x88, 0xa8, 0xc7, 0x89, 0x77, 0x57, + 0x38, 0x76, 0x7a, 0x99, 0xee, 0x29, 0x21, 0x04, 0x71, 0x8f, + 0x8a, 0x10, 0x3f, 0x6b, 0x7a, 0xe4, 0x8d, 0x74, 0x4f, 0x99, + 0xa1, 0x71, 0x37, 0xcc, 0x1d, 0x60, 0x37, 0xcc, 0x9d, 0x60, + 0xc8, 0x33, 0x62, 0x9f, 0x8c, 0x6c, 
0xed, 0xb5, 0xa1, 0xb0, + 0x6a, 0x75, 0xb2, 0x79, 0xac, 0xba, 0x4a, 0x50, 0x9c, 0x08, + 0x34, 0x95, 0x21, 0x0c, 0x70, 0x99, 0xed, 0x8b, 0x70, 0x99, + 0xed, 0x8b, 0x8f, 0x66, 0x12, 0x74, 0xd1, 0x4f, 0x29, 0xd6, + 0x91, 0x18, 0xfb, 0x98, 0xc8, 0xfa, 0x08, 0x4c, 0xd9, 0x5f, + 0x17, 0x7e, 0x2d, 0x81, 0x9b, 0x9c, 0x9c, 0xa3, 0x13, 0x3d, + 0x9c, 0xa3, 0x13, 0x3d, 0x63, 0x5c, 0xec, 0xc2, 0x7c, 0x6f, + 0x6c, 0x95, 0xa4, 0xcf, 0xec, 0x02, 0x94, 0xa6, 0x86, 0xa3, + 0x3f, 0x69, 0x96, 0x3d, 0x1a, 0xdd, 0xd0, 0xf1, 0x10, 0x84, + 0xce, 0x21, 0x10, 0xc5, 0xce, 0x21, 0xef, 0x3a, 0x31, 0xde, + 0x3f, 0xda, 0x87, 0x1f, 0x1e, 0xb6, 0x69, 0x34, 0x95, 0x1c, + 0xd3, 0x67, 0xd6, 0xa0, 0x25, 0x98, 0x26, 0x1a, 0x4c, 0xb2, + 0x02, 0x2a, 0x1f, 0xae, 0x06, 0x2a, 0x1d, 0xaa, 0xf9, 0xd5, + 0xe2, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x89, 0x46, 0x8b, 0x02, 0xaf, 0x78, + 0x3d, 0x9f, 0x52, 0xf3, 0xa6, 0x4a, 0xbd, 0xc3, 0x39, 0x7f, + 0xc6, 0x6c, 0xd2, 0x95, 0xc8, 0x66, 0xf6, 0xfd, 0xc8, 0x66, + 0xf6, 0xfd, 0x37, 0x99, 0x09, 0x02, 0xbe, 0x4b, 0xf3, 0xd7, + 0x43, 0x58, 0x80, 0xf3, 0x0a, 0xeb, 0xd9, 0x24, 0x0e, 0x40, + 0x2b, 0x68, 0x93, 0x12, 0x7f, 0xf6, 0x96, 0x19, 0x2f, 0xd5, + 0x96, 0x13, 0x2d, 0xf6, 0x69, 0xec, 0xd2, 0x09, 0xea, 0xb8, + 0x07, 0x8e, 0x3d, 0xea, 0x71, 0x6b, 0x94, 0x2e, 0xf6, 0x1c, + 0xaf, 0x90, 0xd5, 0xd7, 0x15, 0x78, 0x1c, 0x9d, 0xa5, 0x18, + 0x1e, 0xc9, 0xa5, 0x18, 0x1c, 0xcd, 0x5a, 0xe7, 0xe3, 0x32, + 0xaa, 0x55, 0x62, 0x05, 0x0d, 0x14, 0xa6, 0x2a, 0x55, 0xbb, + 0x99, 0x0e, 0x82, 0x86, 0xe7, 0x4a, 0xf7, 0x92, 0xe9, 0x4b, + 0xa7, 0x71, 0xa9, 0x4b, 0xb7, 0x90, 0xa9, 0x4b, 0x48, 0x6f, + 0x56, 0xb4, 0xe2, 0x3a, 0x02, 0x50, 0xb7, 0xed, 0x73, 0xf0, + 0xee, 0x20, 0x1f, 0xf7, 0x41, 0x9b, 0xbb, 0x16, 0x96, 0x81, + 0xe1, 0x5b, 0xe3, 0x82, 0x82, 0x5e, 0x43, 0x82, 0x83, 0x5e, + 0xbc, 0x7d, 0x7c, 0xa1, 0xb3, 0x34, 0xda, 0x07, 
0x6c, 0xdf, + 0xc0, 0x69, 0x43, 0x40, 0xf8, 0x39, 0x54, 0xd2, 0x27, 0xaf, + 0x11, 0x38, 0x3c, 0x06, 0xe0, 0x18, 0xb4, 0xae, 0xc0, 0x18, + 0x34, 0xae, 0x3f, 0xe7, 0xcb, 0x51, 0xe7, 0xca, 0x17, 0x92, + 0xd6, 0x51, 0x00, 0x27, 0xef, 0x54, 0x52, 0x97, 0xc3, 0xac, + 0x98, 0x5b, 0xf0, 0x1f, 0xb9, 0x88, 0xfd, 0x76, 0xd1, 0xa3, + 0xdd, 0x56, 0xd1, 0xa3, 0x22, 0xa9, 0x2e, 0x5c, 0xe6, 0x3b, + 0x8a, 0xeb, 0x1e, 0x1b, 0x31, 0xf4, 0xd1, 0x84, 0x2b, 0x09, + 0xbf, 0x71, 0xab, 0x4c, 0x86, 0x04, 0x64, 0x1b, 0x96, 0x30, + 0xe5, 0x28, 0x86, 0x30, 0xe1, 0x28, 0x79, 0xcf, 0x1e, 0xd7, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0xad, 0x70, 0x07, 0x15, 0x41, 0xe2, 0xda, 0xf8, + 0x5f, 0xfa, 0x29, 0x0b, 0x74, 0xd5, 0x70, 0x2e, 0x25, 0xac, + 0xc1, 0xbe, 0x43, 0x9b, 0xc0, 0x84, 0x47, 0x99, 0xc0, 0x84, + 0xb8, 0x66, 0x3f, 0x7b, 0x86, 0xe6, 0x84, 0x37, 0x13, 0xf4, + 0x73, 0x2b, 0x32, 0x37, 0xa9, 0x86, 0x87, 0x2f, 0x4e, 0xd1, + 0x53, 0x3d, 0xc6, 0x78, 0x53, 0xb7, 0x77, 0x7a, 0x53, 0x37, + 0x77, 0x78, 0xac, 0xc8, 0x88, 0x87, 0x2c, 0x1e, 0xe1, 0xfa, + 0xb6, 0xbb, 0x93, 0x32, 0x33, 0xe2, 0x1a, 0xe0, 0x14, 0x80, + 0xe5, 0xa4, 0x1d, 0x92, 0x2a, 0x11, 0x5c, 0xba, 0x49, 0x29, + 0x5c, 0xb2, 0x49, 0x29, 0xa3, 0x4d, 0xb6, 0xd6, 0x04, 0xd3, + 0x44, 0x0e, 0x03, 0x98, 0x01, 0x01, 0xec, 0xf0, 0xf4, 0xb8, + 0xc3, 0x81, 0x00, 0x66, 0xbc, 0x34, 0x46, 0xcf, 0x51, 0x01, + 0x17, 0x0f, 0xd1, 0x21, 0x16, 0x0f, 0x2e, 0xde, 0xe9, 0xf0, + 0x8e, 0x46, 0x50, 0x4f, 0x38, 0x65, 0x3c, 0xf8, 0x30, 0xc5, + 0x6d, 0x04, 0x18, 0x23, 0x95, 0xb4, 0xb8, 0x9d, 0x9e, 0x9c, + 0x7a, 0xad, 0x94, 0x5b, 0x7a, 0xad, 0x94, 0x99, 0x85, 0x52, + 0x6b, 0x66, 0x5d, 0x7d, 0xbe, 0x84, 0xdf, 0xa9, 0xa8, 0xef, + 0xd6, 0x05, 0x06, 0x34, 0xcc, 0x56, 0x9f, 0x5c, 0x76, 0x96, + 0x25, 0x99, 0x3e, 0x5d, 0xb7, 0x94, 0x3e, 0x5c, 0x37, 0x9c, + 0xc1, 0xa3, 0xc8, 0x63, 0x84, 0x7c, 0x30, 0x1e, 0x2d, 0x81, + 
0x43, 0x76, 0xb9, 0xdf, 0x6a, 0x4b, 0x53, 0xd7, 0xb4, 0x83, + 0x11, 0xfb, 0x1b, 0xe5, 0x0b, 0xe3, 0xe3, 0x61, 0x0b, 0xe3, + 0x73, 0x65, 0xf4, 0x1c, 0x8c, 0x9a, 0x42, 0xb0, 0x81, 0xf5, + 0x3f, 0xd6, 0x5e, 0x2b, 0x09, 0xd1, 0x68, 0x78, 0xf6, 0x7c, + 0x34, 0x97, 0xb3, 0x72, 0xf5, 0x7b, 0x82, 0xb8, 0x7c, 0x1d, + 0x82, 0xb8, 0x7c, 0x5d, 0x7d, 0x47, 0x83, 0xa2, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x42, 0x78, 0x38, 0x30, 0xe9, 0x08, 0x72, 0x8b, 0x90, 0xd1, + 0x9b, 0xa8, 0x8d, 0xe8, 0x21, 0xd0, 0x7a, 0xf5, 0x97, 0xe7, + 0xfc, 0xf1, 0x37, 0x39, 0xfc, 0xf1, 0x37, 0x39, 0x03, 0x0e, + 0xc8, 0xc6, 0xa8, 0x3a, 0x07, 0x1f, 0xc9, 0x89, 0x53, 0xa3, + 0x67, 0x07, 0xe8, 0xd0, 0x18, 0x6d, 0x74, 0x14, 0xcf, 0xbd, + 0xd8, 0x4a, 0x27, 0x2e, 0xd8, 0x0e, 0x27, 0x2e, 0xd8, 0x0e, + 0xd8, 0xd1, 0x27, 0xf1, 0x20, 0x4d, 0x61, 0xe8, 0xb8, 0x12, + 0x70, 0xe7, 0x17, 0x55, 0x50, 0x4b, 0x5c, 0x9c, 0x4e, 0xd7, + 0xe5, 0x82, 0x8e, 0x6c, 0xce, 0x22, 0x5e, 0x13, 0xc4, 0x02, + 0x5e, 0x53, 0x3b, 0xfd, 0xa1, 0xac, 0x83, 0xaf, 0x89, 0xf6, + 0x1d, 0x2f, 0x00, 0x69, 0xe9, 0xc2, 0x12, 0xbe, 0xca, 0x32, + 0xbc, 0xd8, 0xcc, 0xb0, 0x18, 0x2f, 0x64, 0xb3, 0x19, 0xa5, + 0x6c, 0xb2, 0x19, 0xa5, 0x93, 0x4d, 0xe6, 0x5a, 0xbf, 0x77, + 0xce, 0x27, 0xb7, 0xd3, 0x92, 0xf1, 0x5a, 0x95, 0x4e, 0x05, + 0x60, 0x79, 0x74, 0xf9, 0x90, 0xf8, 0xb5, 0x6a, 0x16, 0xd9, + 0x35, 0x65, 0x16, 0xd9, 0xb5, 0x65, 0xe9, 0x26, 0x4a, 0x9a, + 0xa8, 0xac, 0x85, 0xb5, 0xe3, 0x29, 0x9a, 0xb9, 0xbc, 0x19, + 0xa6, 0xe1, 0x7e, 0xd0, 0xe6, 0x76, 0xad, 0x7b, 0xeb, 0x09, + 0x5d, 0x51, 0xe3, 0x0e, 0xdd, 0x51, 0xe3, 0x0c, 0x22, 0xae, + 0x1c, 0xf3, 0x3a, 0x2a, 0xb9, 0x43, 0xfd, 0x01, 0x22, 0x4a, + 0x5c, 0xc7, 0xff, 0x62, 0xe6, 0x8c, 0xdd, 0x14, 0x57, 0x22, + 0x1c, 0xff, 0x74, 0x07, 0x0d, 0xf1, 0x74, 0x27, 0x1d, 0xf7, + 0x8b, 0xd8, 0xe2, 0x08, 0x4a, 0x4e, 0x03, 0xa9, 0xa5, 0xff, + 0x80, 0xfa, 
0x45, 0x34, 0x7e, 0xde, 0x4c, 0x65, 0x0e, 0x86, + 0xae, 0xc1, 0xe9, 0x00, 0x61, 0xc0, 0x42, 0xc2, 0xe1, 0xe0, + 0x42, 0xc2, 0x1e, 0x1f, 0xbd, 0x3d, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2d, 0xcf, + 0xaf, 0x90, 0x1a, 0xc5, 0x29, 0x00, 0x03, 0x62, 0xfb, 0xdf, + 0x4d, 0x4b, 0xdf, 0x21, 0x43, 0xe1, 0x2b, 0x0a, 0x86, 0x49, + 0xfe, 0x8c, 0x87, 0x49, 0xfe, 0x8c, 0x78, 0xb6, 0x01, 0x73, + 0xf6, 0x3f, 0x6f, 0xd2, 0xe2, 0xf7, 0x78, 0x40, 0xb9, 0xd6, + 0xe0, 0xb4, 0x21, 0x29, 0x0e, 0x8c, 0x83, 0xcc, 0xcf, 0xb2, + 0xa3, 0xd3, 0x4f, 0xb8, 0xa3, 0xd3, 0x4f, 0xb8, 0x5c, 0x2c, + 0xb0, 0x47, 0x42, 0xaf, 0x60, 0x7a, 0x4a, 0x01, 0xf7, 0xea, + 0xdb, 0x4f, 0xf9, 0xd6, 0xd9, 0x75, 0xe1, 0x68, 0xcc, 0x03, + 0xf5, 0x67, 0xb8, 0x95, 0x34, 0x67, 0xd8, 0x15, 0xf4, 0x67, + 0x27, 0xea, 0x0b, 0x98, 0x5c, 0x21, 0x3f, 0x10, 0x7c, 0xf6, + 0x39, 0x60, 0x9c, 0x3c, 0xb2, 0x6b, 0x01, 0x2f, 0x0b, 0x8b, + 0xd4, 0x24, 0xbf, 0x0b, 0x96, 0x0c, 0xab, 0x03, 0x96, 0x0c, + 0xbb, 0x8b, 0x69, 0xf3, 0x44, 0x74, 0xff, 0xec, 0xa7, 0x4e, + 0x06, 0x09, 0x49, 0x8a, 0x3f, 0xc6, 0x0f, 0x2e, 0x55, 0x98, + 0xd2, 0x44, 0x04, 0xae, 0x2e, 0xa0, 0xd4, 0x86, 0x6e, 0xa8, + 0x54, 0x8e, 0x6e, 0xa8, 0xab, 0x71, 0x91, 0x57, 0xeb, 0x8d, + 0xe3, 0x92, 0x9d, 0x42, 0xa5, 0x88, 0x53, 0xd1, 0xfe, 0x8b, + 0x98, 0xef, 0xe3, 0x0b, 0xf6, 0x41, 0x1f, 0xed, 0x50, 0x4b, + 0x9f, 0x5f, 0x50, 0x4b, 0x9f, 0x4d, 0xaf, 0xb4, 0x60, 0xb2, + 0x95, 0xa2, 0x74, 0x96, 0xa2, 0x13, 0xc0, 0x3a, 0x16, 0x37, + 0x67, 0xb5, 0xa6, 0xde, 0x74, 0x1e, 0x26, 0xba, 0x8d, 0x3c, + 0xa4, 0x5f, 0x9f, 0x34, 0xa4, 0x5f, 0x9f, 0x34, 0x5b, 0xa0, + 0x60, 0xcb, 0xd6, 0xd7, 0x42, 0xf8, 0x12, 0x3c, 0x4b, 0xe4, + 0xca, 0xf0, 0xd2, 0x55, 0x6f, 0xfe, 0xa5, 0x6b, 0x64, 0xfd, + 0xc3, 0x1b, 0xe9, 0x77, 0xce, 0x1b, 0xe9, 0x77, 0xce, 0x1b, + 0x16, 0x88, 0x31, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5e, 0xcd, 0xe5, 0x27, + 0x6b, 0x60, 0xe6, 0x30, 0xc4, 0xd6, 0xbb, 0xd8, 0xbe, 0x1d, + 0x9b, 0xa8, 0xde, 0x19, 0xec, 0x31, 0xfe, 0x9c, 0xe4, 0xa3, + 0xfe, 0x1c, 0xe4, 0xa0, 0x01, 0xe3, 0x1b, 0x5f, 0x4e, 0x58, + 0xd8, 0x86, 0x9a, 0xa5, 0xc2, 0x00, 0x86, 0xee, 0x79, 0x65, + 0x24, 0xa4, 0x73, 0xcd, 0xad, 0xba, 0x27, 0x03, 0x06, 0x39, + 0x6b, 0x81, 0x06, 0x39, 0x63, 0x81, 0xf9, 0xc6, 0x9c, 0x7e, + 0x9d, 0xc6, 0xb2, 0xe3, 0xa3, 0xcf, 0xc4, 0x58, 0xd8, 0x61, + 0x76, 0xaa, 0x78, 0x03, 0x07, 0xef, 0xa1, 0x9b, 0xff, 0xf0, + 0xb1, 0x2b, 0x5f, 0xd8, 0xb1, 0x2b, 0x5f, 0xc8, 0x4e, 0xd4, + 0xa0, 0x37, 0x77, 0x6a, 0x53, 0x55, 0x2a, 0xbb, 0xf6, 0x31, + 0x0e, 0x70, 0x60, 0x94, 0x58, 0x33, 0x4d, 0x49, 0xef, 0xea, + 0x71, 0xdf, 0x64, 0x10, 0x12, 0x4b, 0x6e, 0x10, 0x10, 0x4b, + 0x91, 0xef, 0xef, 0xb4, 0xb7, 0xdc, 0xd2, 0x98, 0x57, 0x81, + 0x3c, 0xd2, 0xe2, 0x43, 0xb8, 0xa2, 0xad, 0xa4, 0xf5, 0x08, + 0x19, 0xb7, 0xb2, 0x6d, 0x6b, 0xbc, 0xbb, 0x0e, 0x69, 0xb4, + 0xbb, 0x0e, 0x96, 0x4b, 0x44, 0xf1, 0x45, 0x3c, 0x05, 0x69, + 0x15, 0x18, 0xf9, 0xec, 0x04, 0x36, 0x72, 0x17, 0x43, 0x4d, + 0x2c, 0x88, 0x54, 0x74, 0xdb, 0xf3, 0x60, 0xf4, 0xbd, 0xba, + 0x40, 0x74, 0xbd, 0xaa, 0xbf, 0x8b, 0x42, 0x55, 0xff, 0xf8, + 0x65, 0xd2, 0x7b, 0x20, 0x2d, 0x3b, 0xa6, 0x4b, 0x66, 0x56, + 0xe3, 0xe3, 0x23, 0x87, 0xe0, 0x30, 0x2d, 0xbe, 0x7c, 0x61, + 0x2d, 0xcb, 0x7c, 0x61, 0x2d, 0x8b, 0x83, 0x9e, 0xd2, 0x74, + 0xb6, 0xc4, 0xb4, 0xf7, 0xf0, 0x9c, 0xe5, 0x40, 0x60, 0x9c, + 0x9c, 0x1b, 0xee, 0xe4, 0x8f, 0x88, 0x10, 0xe2, 0x91, 0x31, + 0x50, 0xd5, 0x9c, 0x50, 0x50, 0xd5, 0x9c, 0x10, 0xaf, 0x2a, + 0x63, 0xef, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xd0, 0x71, 0xc5, 0x4a, 0x13, 0x27, + 0xe6, 0x0e, 0x3d, 0x53, 0x17, 0x33, 
0x86, 0xf7, 0x72, 0xd8, + 0x96, 0x12, 0xc1, 0x58, 0x52, 0x5a, 0xd1, 0x98, 0x52, 0x5b, + 0xd1, 0x98, 0xad, 0xa4, 0x2e, 0x67, 0xf3, 0xcd, 0x40, 0x44, + 0xbd, 0x28, 0x21, 0x89, 0xa6, 0xb4, 0x9a, 0xb8, 0x30, 0xae, + 0xe8, 0xee, 0x2d, 0x39, 0xe2, 0x2e, 0x6f, 0x7c, 0xf1, 0x5c, + 0x6f, 0x78, 0xe3, 0x6e, 0x90, 0x87, 0x1c, 0x91, 0xc9, 0x8d, + 0xc2, 0x64, 0x2b, 0x03, 0x86, 0x5c, 0x94, 0xa8, 0xe6, 0x7a, + 0xf7, 0x40, 0x86, 0xbe, 0xbb, 0x61, 0x1d, 0xe0, 0x99, 0xf9, + 0xbd, 0xb0, 0x99, 0xf9, 0xbd, 0xb0, 0x66, 0x06, 0x42, 0x4f, + 0xc6, 0xb5, 0xef, 0xf2, 0xcb, 0x0c, 0xae, 0x1b, 0x09, 0x8a, + 0x03, 0xcd, 0xb3, 0x7b, 0x22, 0x02, 0x76, 0xe8, 0xab, 0x32, + 0x74, 0xcc, 0x22, 0x26, 0x76, 0xcc, 0x22, 0x22, 0x89, 0x33, + 0xdd, 0xdd, 0xcb, 0xca, 0xe4, 0x24, 0x0f, 0x9b, 0x44, 0xb0, + 0x0c, 0xec, 0x1d, 0x24, 0xe5, 0xc1, 0x47, 0xb3, 0xbc, 0x60, + 0xad, 0x51, 0xe2, 0xe2, 0x2d, 0x5d, 0xe0, 0xe2, 0x2d, 0x55, + 0x1f, 0x1d, 0xd2, 0xaa, 0x78, 0x65, 0x42, 0x20, 0xd9, 0xc6, + 0x7f, 0x58, 0x02, 0x8b, 0x8b, 0x54, 0x64, 0xea, 0xc5, 0x56, + 0x16, 0x04, 0x75, 0xf0, 0x74, 0x9c, 0xcc, 0x80, 0x54, 0x8e, + 0xc5, 0xf0, 0xab, 0x71, 0x3a, 0x0f, 0x31, 0x9e, 0x75, 0x1e, + 0x41, 0x8f, 0x48, 0x84, 0x25, 0x09, 0xd0, 0x30, 0xc0, 0x2f, + 0xda, 0xba, 0x0e, 0x33, 0xb8, 0xea, 0x46, 0x2d, 0xf4, 0x73, + 0x46, 0x2f, 0xf4, 0xf3, 0xb9, 0xd0, 0x0b, 0x0c, 0x2a, 0x2c, + 0xab, 0x38, 0x2c, 0xa1, 0x4e, 0xac, 0x04, 0x56, 0xf1, 0x8a, + 0xca, 0xc3, 0x14, 0xb6, 0x15, 0x83, 0x73, 0x6d, 0xb7, 0x91, + 0x18, 0x47, 0x97, 0x93, 0x10, 0x47, 0x68, 0x6c, 0xef, 0xb8, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x5d, 0x80, 0xb1, 0x27, 0xb4, 0xb2, 0x4b, 0x0c, + 0x6a, 0xaa, 0x6d, 0x8a, 0xf7, 0xae, 0xb4, 0x61, 0xb2, 0x81, + 0x47, 0x95, 0xf4, 0x8f, 0xc0, 0x83, 0xf4, 0x8b, 0xc0, 0x87, + 0x0b, 0x74, 0x3f, 0x78, 0x8b, 0xc9, 0x6d, 0x3d, 0x54, 0x1e, + 0x54, 0xdd, 0x68, 0x2b, 0xf7, 0xb2, 0x86, 0x79, 
0xdb, 0x0a, + 0xdd, 0x1b, 0x4a, 0xb4, 0xe0, 0x06, 0xa7, 0x14, 0xe0, 0x0e, + 0xa7, 0x14, 0x1f, 0xf1, 0x58, 0xeb, 0x66, 0xf8, 0x4f, 0x13, + 0x43, 0x34, 0x4c, 0x27, 0x1d, 0xee, 0xe6, 0x19, 0x14, 0xc8, + 0x9d, 0x41, 0x68, 0x71, 0xba, 0xe5, 0x7f, 0x15, 0xac, 0xa9, + 0x7d, 0x35, 0xae, 0xe9, 0x82, 0xca, 0x51, 0x16, 0xe0, 0xaf, + 0x12, 0x83, 0xac, 0x0c, 0x63, 0x31, 0x0f, 0xb9, 0x00, 0x51, + 0x0e, 0x27, 0xac, 0x84, 0x12, 0x1e, 0x02, 0x4e, 0x0c, 0x2f, + 0x61, 0xce, 0x0c, 0x2f, 0x61, 0x4e, 0xf3, 0xd0, 0x9e, 0xb1, + 0x7c, 0x5b, 0xcb, 0xde, 0x06, 0x2c, 0x19, 0xeb, 0x19, 0x2e, + 0x71, 0x96, 0xc4, 0xac, 0x44, 0x03, 0x5d, 0x1b, 0xb6, 0xae, + 0x45, 0xbe, 0x97, 0x8f, 0x45, 0x3e, 0x97, 0x8f, 0xba, 0xc1, + 0x68, 0x70, 0x46, 0x67, 0xfa, 0x4b, 0x16, 0xd1, 0x80, 0x61, + 0x87, 0xdd, 0x7c, 0x75, 0x0b, 0xe0, 0x48, 0xf4, 0xa6, 0x0b, + 0xe1, 0x02, 0x0b, 0x8a, 0x61, 0x40, 0x8f, 0x8b, 0x61, 0x40, + 0x70, 0x74, 0x9e, 0xbf, 0x7b, 0xfb, 0xcb, 0x91, 0xf4, 0x67, + 0x6c, 0x57, 0xf6, 0x16, 0x4f, 0x75, 0x42, 0x28, 0xbf, 0x0e, + 0x62, 0x69, 0x70, 0xba, 0x4a, 0x2a, 0x60, 0xa0, 0x4a, 0x28, + 0x60, 0xa2, 0xb5, 0xd7, 0x9f, 0x5d, 0xde, 0x3d, 0x13, 0x11, + 0xb6, 0xad, 0x07, 0xf7, 0x65, 0xbf, 0xb0, 0xb6, 0xd6, 0x10, + 0x28, 0xf7, 0x9b, 0x62, 0x0b, 0x48, 0x53, 0xb4, 0x2b, 0x33, + 0xd3, 0xb4, 0x2b, 0x73, 0x2c, 0x4b, 0xd4, 0x8c, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xbb, 0x06, 0xf8, 0xf4, 0xe2, 0x7f, 0x58, 0xb0, 0x4c, 0x95, + 0xc4, 0x15, 0x18, 0xd4, 0x3c, 0xd2, 0x2f, 0xe7, 0x0d, 0x13, + 0x19, 0x67, 0x0f, 0x50, 0x39, 0x67, 0x0c, 0x50, 0xc6, 0x98, + 0xf3, 0xaf, 0xca, 0x03, 0x23, 0x31, 0x22, 0xf6, 0xe4, 0xc6, + 0x43, 0x06, 0xd3, 0x04, 0x83, 0xf9, 0x2b, 0xe0, 0x5a, 0x80, + 0x4c, 0x36, 0x12, 0xf0, 0xc1, 0x77, 0x12, 0xd1, 0xc9, 0x77, + 0xed, 0x2e, 0x36, 0x88, 0xe9, 0x74, 0x24, 0xbb, 0x6a, 0x18, + 0x27, 0x66, 0xfb, 0xab, 0x3c, 0x45, 0xef, 0xba, 0x9c, 0x13, + 
0xe7, 0xe3, 0xe2, 0x8e, 0x62, 0x97, 0x76, 0x03, 0x62, 0xb3, + 0xf6, 0x03, 0x9d, 0x4c, 0x09, 0xfc, 0x3b, 0x18, 0x69, 0x09, + 0xc9, 0xb5, 0x86, 0x0b, 0xb5, 0xc5, 0xe9, 0x2f, 0xd8, 0x24, + 0x62, 0x33, 0x91, 0x04, 0x64, 0xf8, 0x90, 0x01, 0x14, 0xfa, + 0x90, 0x01, 0x20, 0xfa, 0x6f, 0xfe, 0xdf, 0x05, 0x8a, 0x3f, + 0x8b, 0x7e, 0xad, 0x86, 0xaf, 0x6e, 0x24, 0x8a, 0x72, 0xc7, + 0x72, 0xea, 0x7b, 0xf6, 0x20, 0x25, 0x44, 0xb7, 0xa2, 0xb0, + 0x76, 0xc7, 0xa2, 0xa0, 0x76, 0xd7, 0x5d, 0x5f, 0x89, 0x28, + 0x3e, 0xa2, 0x76, 0x94, 0x65, 0x98, 0x57, 0xe7, 0x89, 0xb6, + 0x26, 0x29, 0x9c, 0xfb, 0xb6, 0xa6, 0xa2, 0x7e, 0xfa, 0xe2, + 0xb0, 0xf4, 0xe9, 0xe6, 0xb0, 0xf4, 0xe9, 0xe6, 0x4f, 0x0b, + 0x16, 0x19, 0x4d, 0xa5, 0x55, 0xd0, 0x1f, 0x4d, 0x7b, 0x8d, + 0xda, 0x14, 0xd1, 0xa6, 0x68, 0x62, 0xbb, 0xe5, 0x22, 0xba, + 0x2c, 0xc5, 0xc8, 0x7e, 0x7c, 0x49, 0xc8, 0x7a, 0x7c, 0x4d, + 0x37, 0x85, 0x83, 0xb2, 0xc3, 0x5e, 0x01, 0xcd, 0x9c, 0x11, + 0xf3, 0x2d, 0x7a, 0xee, 0x11, 0x31, 0xdc, 0xab, 0x44, 0xc4, + 0x51, 0x14, 0xb3, 0x77, 0x48, 0xb4, 0x45, 0x01, 0x58, 0xb4, + 0x45, 0x41, 0xa7, 0x4b, 0xba, 0xbe, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xbe, 0xc5, + 0x05, 0xaf, 0x59, 0x37, 0xee, 0x47, 0xeb, 0x14, 0xa1, 0x64, + 0xd2, 0x71, 0x6a, 0x0a, 0x66, 0x20, 0x65, 0xdf, 0xc6, 0x29, + 0x75, 0x22, 0x46, 0x21, 0x75, 0x26, 0xb9, 0xde, 0x8a, 0xd9, + 0x70, 0x38, 0x0c, 0x91, 0x5f, 0x36, 0x4e, 0xfd, 0x6f, 0x1f, + 0x8b, 0x2d, 0x97, 0xe7, 0x26, 0x1e, 0x06, 0x4f, 0x7c, 0xbe, + 0x02, 0xe7, 0xb5, 0xae, 0x82, 0xe7, 0xb4, 0xbe, 0x7d, 0x18, + 0x4b, 0x41, 0xed, 0xf3, 0x96, 0x19, 0xef, 0x3d, 0x82, 0xea, + 0xd6, 0x41, 0xb4, 0x36, 0xd4, 0x62, 0x5b, 0xf6, 0x85, 0xaf, + 0x52, 0xf8, 0xae, 0x61, 0x5e, 0xa7, 0xac, 0x61, 0x5e, 0xa7, + 0x53, 0x9e, 0xa1, 0x58, 0x63, 0x0e, 0x62, 0xd3, 0x65, 0x9c, + 0x9e, 0x88, 0x23, 0xdd, 0xa4, 0xc0, 0x8c, 0x63, 0xdd, 0xd9, + 0x45, 0xde, 
0x60, 0x0f, 0xd6, 0x46, 0x02, 0xa3, 0x54, 0x4e, + 0x02, 0xa3, 0xab, 0xb1, 0xfd, 0x5c, 0xef, 0x16, 0x95, 0xbd, + 0x36, 0x9d, 0x3a, 0xd2, 0x62, 0x0c, 0x58, 0x20, 0x86, 0x9f, + 0x1d, 0xb8, 0xbf, 0x75, 0x78, 0xa6, 0x17, 0x7c, 0x59, 0x21, + 0x97, 0x7c, 0x59, 0x25, 0x68, 0x83, 0xa6, 0xda, 0x5f, 0xd0, + 0x64, 0xf4, 0x18, 0x66, 0x4e, 0xbf, 0x5b, 0xfc, 0x3f, 0x39, + 0xbe, 0xf4, 0x88, 0xe1, 0xe4, 0x44, 0x5a, 0xd6, 0xf7, 0x67, + 0x1a, 0x12, 0xf7, 0x44, 0x1a, 0x12, 0x08, 0xbb, 0xe5, 0xed, + 0xbc, 0xf8, 0xe7, 0x64, 0xf8, 0xf0, 0xa5, 0xc5, 0x00, 0xc5, + 0xf5, 0x39, 0x05, 0x8a, 0xc1, 0x6d, 0x29, 0xfd, 0x8e, 0x4d, + 0x68, 0x94, 0xc7, 0x6d, 0x28, 0x94, 0xc7, 0x6d, 0xd7, 0x6b, + 0x38, 0x92, 0x5a, 0xad, 0x07, 0xc2, 0x9b, 0x86, 0xe7, 0x02, + 0x5f, 0x10, 0xef, 0x0d, 0xe7, 0xf2, 0x98, 0x78, 0x3d, 0xe8, + 0xe4, 0x0f, 0x65, 0xc8, 0xd0, 0x7b, 0x65, 0xc8, 0xd0, 0x5b, + 0x9a, 0x37, 0x2f, 0xa4, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xce, 0xb3, 0x2b, + 0x6c, 0xd6, 0x8b, 0x4e, 0xf2, 0xab, 0xeb, 0x05, 0xb0, 0xe7, + 0xcc, 0xd2, 0x15, 0xfe, 0x97, 0x4d, 0x86, 0xbe, 0xdf, 0xcd, + 0x86, 0xbe, 0xcf, 0xcd, 0x79, 0x41, 0x30, 0x32, 0x9c, 0xa9, + 0x61, 0x00, 0x3e, 0x63, 0xb4, 0x5b, 0x38, 0xf2, 0x36, 0x88, + 0x07, 0xb6, 0x64, 0x45, 0x92, 0xe4, 0x4d, 0xa2, 0x00, 0xb7, + 0x4d, 0x2c, 0x00, 0xb7, 0x4d, 0xac, 0xff, 0x48, 0xb2, 0x53, + 0xcf, 0x72, 0x93, 0xb4, 0x91, 0xbc, 0x1b, 0x03, 0x01, 0x7a, + 0xf9, 0x37, 0x06, 0xcf, 0x5c, 0x8e, 0x2f, 0xc9, 0x76, 0xb3, + 0x18, 0x7a, 0x76, 0xfc, 0x18, 0xba, 0x76, 0xbe, 0xe7, 0x05, + 0x89, 0x41, 0x6a, 0x72, 0x3f, 0xb8, 0x16, 0xfd, 0xf3, 0xdc, + 0xe4, 0x97, 0xbc, 0xcc, 0x12, 0x9b, 0x07, 0x02, 0xaf, 0xd9, + 0xdb, 0x92, 0x8d, 0x59, 0xc7, 0x06, 0x8d, 0xd9, 0xc7, 0x06, + 0x72, 0x26, 0x38, 0xf9, 0xcd, 0x73, 0xe4, 0x35, 0x7a, 0x80, + 0xf9, 0xfb, 0x09, 0x53, 0xe6, 0x44, 0x54, 0x9c, 0x24, 0x13, + 0xc8, 0xe3, 0x82, 0xe1, 
0x48, 0xca, 0x94, 0x63, 0x48, 0xca, + 0x84, 0x63, 0xb7, 0x35, 0x7b, 0x9c, 0x41, 0xbb, 0x1b, 0xf7, + 0x2c, 0xfe, 0x1e, 0x13, 0x65, 0x39, 0xde, 0x26, 0x46, 0x50, + 0xf2, 0xa1, 0x00, 0x0b, 0xd5, 0x79, 0x54, 0x43, 0x15, 0x2a, + 0x44, 0x43, 0x15, 0x6b, 0xbb, 0xbc, 0xea, 0x94, 0xa3, 0x44, + 0xb6, 0xc5, 0x53, 0xaa, 0xcd, 0xa0, 0xe3, 0x06, 0xeb, 0x20, + 0x73, 0xc4, 0x45, 0x0d, 0x1a, 0xba, 0xda, 0x20, 0xd3, 0xe2, + 0xba, 0x64, 0xd3, 0xe2, 0xba, 0x20, 0x2c, 0x1d, 0x45, 0xdf, + 0x34, 0x47, 0x05, 0x41, 0x1c, 0xb3, 0x04, 0x02, 0x19, 0xa1, + 0xa4, 0x04, 0x7f, 0x75, 0xc3, 0xc7, 0xf9, 0xaf, 0xac, 0x8f, + 0xd9, 0xf3, 0x4f, 0xb7, 0xd9, 0xf3, 0x4f, 0x97, 0x26, 0x0c, + 0xb0, 0x68, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x13, 0xb9, 0xd8, 0x0b, 0xa9, 0xaf, + 0x83, 0x19, 0xb5, 0xea, 0x75, 0x81, 0xb1, 0xd3, 0x52, 0x95, + 0x60, 0x4b, 0x18, 0xde, 0x74, 0x9f, 0x98, 0x9d, 0x74, 0xdf, + 0x18, 0x9d, 0x8b, 0x20, 0xe7, 0x62, 0x82, 0x65, 0xb3, 0x18, + 0x0c, 0xa4, 0x3b, 0xfb, 0x39, 0xff, 0xe7, 0x47, 0x7b, 0x1d, + 0xb6, 0xfe, 0x7b, 0xb3, 0xa6, 0xda, 0x53, 0xb5, 0x66, 0xdb, + 0x5b, 0xb5, 0x66, 0xdb, 0xa4, 0x4a, 0x99, 0x24, 0x01, 0x3f, + 0xf3, 0xc0, 0x60, 0x70, 0x14, 0xd2, 0xe0, 0xf8, 0xc4, 0x31, + 0xc9, 0x20, 0xc1, 0x69, 0x35, 0x30, 0x56, 0xc9, 0x84, 0x28, + 0x4d, 0x4a, 0x8c, 0x28, 0x4d, 0x4a, 0x73, 0xd7, 0xb2, 0xb5, + 0x3f, 0x6e, 0x45, 0xd9, 0xa0, 0x54, 0x23, 0x06, 0x23, 0xcd, + 0x8f, 0x76, 0x39, 0x7d, 0xa7, 0x4d, 0x25, 0x90, 0x1e, 0x6b, + 0x21, 0xdf, 0xf3, 0x29, 0x21, 0xd7, 0xfb, 0x69, 0xde, 0x28, + 0x04, 0x96, 0x87, 0x17, 0xd5, 0x68, 0x9c, 0x09, 0x04, 0xce, + 0x1c, 0xc2, 0x2e, 0xed, 0xe5, 0xe2, 0xf5, 0x61, 0x52, 0x34, + 0x4f, 0x27, 0x5a, 0x66, 0x43, 0x21, 0x7a, 0x66, 0x43, 0x21, + 0x85, 0x99, 0xbc, 0xde, 0xb3, 0x1a, 0x43, 0x87, 0x0a, 0xd3, + 0xda, 0xc4, 0xa9, 0xfd, 0x54, 0xd3, 0x60, 0xa0, 0x2c, 0x41, + 0x75, 0x74, 0x20, 0x25, 0x68, 0x68, 
0x2c, 0x03, 0x68, 0x60, + 0x2c, 0x03, 0x97, 0x9f, 0xd3, 0xfc, 0xec, 0x2a, 0x7c, 0x2e, + 0x7b, 0x70, 0x1b, 0x98, 0xc6, 0xa7, 0xd2, 0x32, 0xbd, 0x35, + 0xff, 0x14, 0x27, 0x7d, 0x1a, 0x50, 0x8e, 0x61, 0x3e, 0x50, + 0x8f, 0x61, 0x3e, 0x50, 0x70, 0x9e, 0xc1, 0xaf, 0x06, 0x1f, + 0xe1, 0xf8, 0x97, 0xab, 0x39, 0x0c, 0x51, 0x48, 0x88, 0x2a, + 0xa4, 0xdf, 0x5c, 0x42, 0x5e, 0xee, 0x8a, 0xec, 0x5c, 0xef, + 0xcc, 0xdc, 0x5c, 0xef, 0xcc, 0xdc, 0xa3, 0x10, 0x33, 0x23, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00 +}; + diff --git a/parallel_bare_tests/Dijkstra/Dijkstra.c b/parallel_bare_tests/Dijkstra/Dijkstra.c index a6f6cab..8fe903d 100644 --- a/parallel_bare_tests/Dijkstra/Dijkstra.c +++ b/parallel_bare_tests/Dijkstra/Dijkstra.c @@ -96,8 +96,8 @@ int main() #endif if ( hc == 2 && id == 0 ) { - reset_timer(); - start_timer(); + reset_timer(rt_cluster_id()); + start_timer(rt_cluster_id()); } synch_barrier(); @@ -105,8 +105,8 @@ int main() dijkstra_distance(mind, ohd); if ( hc == 2 && id == 0 ) { - stop_timer(); - time = get_time(); + stop_timer(rt_cluster_id()); + time = get_time(rt_cluster_id()); } #ifdef PROFILE // stop performance counters diff --git a/parallel_bare_tests/LU/LU.c b/parallel_bare_tests/LU/LU.c index 3700837..d6ddc81 100644 --- a/parallel_bare_tests/LU/LU.c +++ b/parallel_bare_tests/LU/LU.c @@ -174,8 +174,8 @@ int main(int argc, char **argv) for(hc = 0; hc < 3; ++hc) { if ( hc == 2 && id == 0 ) { - reset_timer(); - start_timer(); + reset_timer(rt_cluster_id()); + start_timer(rt_cluster_id()); } perf_reset(); perf_start(); @@ -185,8 +185,8 @@ int main(int argc, char **argv) factor(G, N, N, pivots); if ( hc == 2 && id == 0 ) { - stop_timer(); - time = get_time(); + stop_timer(rt_cluster_id()); + time = get_time(rt_cluster_id()); } perf_stop(); diff --git a/parallel_bare_tests/conv16/conv16.c 
b/parallel_bare_tests/conv16/conv16.c index c4d95a2..38c3a92 100644 --- a/parallel_bare_tests/conv16/conv16.c +++ b/parallel_bare_tests/conv16/conv16.c @@ -89,10 +89,10 @@ int test_singlethread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, in if(rt_core_id() == 0) { load(); - reset_timer(); - start_timer(); + reset_timer(rt_cluster_id()); + start_timer(rt_cluster_id()); test(g_W, g_x, g_y, IH, IW, FH, FW, OH, OW, 1, 0, 0); - stop_timer(); + stop_timer(rt_cluster_id()); #ifdef CHECK_CHECKSUM errors = 0; @@ -110,7 +110,7 @@ int test_singlethread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, in #endif #ifndef PULP_SPI - printf("%s, errors=%d, time=%d\n", str, errors, get_time()); + printf("%s, errors=%d, time=%d\n", str, errors, get_time(rt_cluster_id())); #endif } @@ -129,12 +129,12 @@ int test_multithread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, int synch_barrier(); if(rt_core_id() == 0) { - reset_timer(); - start_timer(); + reset_timer(rt_cluster_id()); + start_timer(rt_cluster_id()); } test(g_W, g_x, g_y, IH, IW, FH, FW, OH, OW, 1, 0, 0); if(rt_core_id() == 0) { - stop_timer(); + stop_timer(rt_cluster_id()); #ifdef CHECK_CHECKSUM errors = 0; @@ -152,7 +152,7 @@ int test_multithread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, int #endif #ifndef PULP_SPI - printf("%s, errors=%d, time=%d\n", str, errors, get_time()); + printf("%s, errors=%d, time=%d\n", str, errors, get_time(rt_cluster_id())); #endif } diff --git a/parallel_bare_tests/parMatrixMul/parMatrixMul.c b/parallel_bare_tests/parMatrixMul/parMatrixMul.c index 78a3239..4ddd6aa 100644 --- a/parallel_bare_tests/parMatrixMul/parMatrixMul.c +++ b/parallel_bare_tests/parMatrixMul/parMatrixMul.c @@ -57,9 +57,6 @@ testcase_t testcases[] = { int main() { - if (rt_cluster_id() != 0) - return bench_cluster_forward(0); - int nbErrors = run_suite(testcases); synch_barrier(); @@ -85,9 +82,9 @@ void matrix_multiplication(testresult_t *result, void (*start)(), void (*stop)() lb 
= coreid * chunk; //upper bound ub = lb + chunk; - + synch_barrier(); - + /********************* Benchmark Execution *********************/ if (coreid +#include +#include "matmul.h" + +#define N_ITERS 1 +#define max(x,y) (x > y ? x : y) +#define min(x,y) (x < y ? x : y) + +__attribute__ ((section(".heapsram"))) int A[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int B[SIZE][SIZE]; +__attribute__ ((section(".heapsram"))) int C[SIZE][SIZE]; + +void initialize_mat(); + +void initialize_mat() { + int i,j; + + for (i=0;i +#include +#include +#include + +#define SIZE 0x400 +#define NUM_BANKS 16 +#define SCRUBBER_INTERVAL 2 + +int main() { + // Collecting info about the core ID and the running cluster ID + unsigned int core_id = get_core_id(); + unsigned int cluster_id = rt_cluster_id(); + + if (rt_cluster_id() != 0) return bench_cluster_forward(0); + + if (core_id != 0) synch_barrier(); + + unsigned int *test_array = pi_l1_malloc(cluster_id, SIZE); + + // Initializing the memory + for (int i = 0; i < SIZE; i++) { + pulp_write32(&test_array[i], i); + } + + // Initialize the scrubbing interval for all memory banks + for (int i = 0; i < NUM_BANKS; i++) + tcdm_scrubber_set_interval(cluster_id, i, SCRUBBER_INTERVAL); + + // Initialize the error-tracking variables + bool mismatch = 0; + unsigned int error = 0; + for (int i = 0; i < SIZE; i++) { + mismatch = (pulp_read32(&test_array[i]) != i); + if (mismatch) { + error ++; + printf("Expected 0x%x, got 0x%x\n", i, pulp_read32(&test_array[i])); + } + } + + unsigned int mismatch_cnt = 0; + unsigned int fix_cnt = 0; + unsigned int uncorrectable_cnt = 0; + for (int i = 0; i < 16; i++) { + mismatch_cnt += tcdm_scrubber_get_mismatch_count(cluster_id, i); + fix_cnt += tcdm_scrubber_get_fix_count(cluster_id, i); + uncorrectable_cnt += tcdm_scrubber_get_uncorrectable_count(cluster_id, i); + } + + printf("mismatch_cnt: %d, fix_cnt: %d, uncorrectable_cnt: %d\n", mismatch_cnt, fix_cnt, uncorrectable_cnt); + + return (error != 0) && 
(uncorrectable_cnt == 0); +} diff --git a/reliability_tests/ecc_test/pulp_inject_fault.tcl b/reliability_tests/ecc_test/pulp_inject_fault.tcl new file mode 100644 index 0000000..45cef47 --- /dev/null +++ b/reliability_tests/ecc_test/pulp_inject_fault.tcl @@ -0,0 +1,53 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Author: Michael Rogenmoser (michaero@iis.ee.ethz.ch) + +transcript quietly +if {! [info exists ::env(VSIM_PATH)]} {error "Define VSIM_PATH"} +set utils_base_path [file join $::env(VSIM_PATH) scripts fault_injection_utils] +set script_base_path [file join $::env(VSIM_PATH) fault_injection_sim scripts] + +set verbosity 2 +set log_injections 1 +# Easy way to generate a variable seed +# set seed [clock seconds] +# Default value +set seed 12345 +set print_statistics 1 + +set inject_start_time 110584000000ps +set inject_stop_time 203880000000ps +set injection_clock "pulp_cluster_tb/cluster_i/clk_i" +set injection_clock_trigger 0 +set fault_period 100 +set rand_initial_injection_phase 0 +# max_num set to 0 means until stop_time +set max_num_fault_inject 0 +set signal_fault_duration 20ns +set register_fault_duration 0ns + +set allow_multi_bit_upset $::env(MULTI_BIT_UPSET) +set use_bitwidth_as_weight 0 +set check_core_output_modification 0 +set check_core_next_state_modification 0 +set reg_to_sig_ratio 1 + +source [file join $utils_base_path pulp_extract_nets.tcl] + +set inject_signals_netlist [] +set inject_register_netlist [] +set output_netlist [] +set next_state_netlist [] +set assertion_disable_list [] + +# for {set idx 0} {$idx < 12} {incr idx} { +# set inject_signals_netlist [list {*}$inject_signals_netlist {*}[get_all_core_nets $idx]] +# set output_netlist [list {*}$output_netlist {*}[get_core_output_nets $idx]] +# } + +set inject_register_netlist [list {*}$inject_register_netlist {*}[get_memory_slice {0 16} {385 449}]] + +source 
[file join $script_base_path inject_fault.tcl] + diff --git a/reliability_tests/icache_fi_conv16/Makefile b/reliability_tests/icache_fi_conv16/Makefile new file mode 100644 index 0000000..58620aa --- /dev/null +++ b/reliability_tests/icache_fi_conv16/Makefile @@ -0,0 +1,11 @@ +PULP_APP = test +PULP_APP_SRCS = icache_conv16.c + +PULP_CFLAGS = -O3 + +ifeq ($(fault_inject),1) + export FAULT_INJECTION=1 + export FAULT_INJECTION_SCRIPT=$(CURDIR)/pulp_inject_fault.tcl +endif + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/reliability_tests/icache_fi_conv16/conv16.h b/reliability_tests/icache_fi_conv16/conv16.h new file mode 100644 index 0000000..b75c587 --- /dev/null +++ b/reliability_tests/icache_fi_conv16/conv16.h @@ -0,0 +1,397 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ +/****************************************************************************** + * * + * Multitherman Lab @ DEI - University of Bologna * + * Viale Risorgimento 2 40136 * + * Bologna - phone 0512092759 * + * * + * Engineer: Francesco Conti - f.conti@unibo.it * + * * + * Project: CConvNet * + * File: conv16.h * + * Description: 16-bit fixed point convolution test * + * * + ******************************************************************************/ + +#ifndef _CONV16_H +#define _CONV16_H + +// uncomment if doing test with SPI (disables printf's) +// #define PULP_SPI + +// fractionary bits +#define QF 13 + +// uncomment if checking errors in detail +#define CHECK_ERROR + +// uncomment if performing checks at all +#define CHECK_CHECKSUM + +#define IMPRECISE_ASM5 + +#define FIXED_MUL(a,b) ((a*b) >> QF); + +#define IH 32 +#define IW 32 +#define FH 5 +#define FW 5 +#define OH (IH-FH+1) +#define OW (IW-FW+1) + +// right checksum +#define RIGHT_CHECKSUM 0x009da8b0 + +#define FIXED_MUL_ASM(W_ptr,x_ptr,conv) \ +__asm__ volatile \ +( \ + "l.lhs r28,0x0(%0); " \ + "l.addi %0,%0,0x4; " \ + "l.addi %1,%1,0xFFFC; " \ + "l.mul r28,r28,r29; " \ + "l.srli r28,r28,0xD; " \ + "l.add %2,%2,r28; " \ + : "=r"(W_ptr), "=r"(x_ptr), "=&r"(conv) \ + : "r"(W_ptr), "r"(x_ptr) \ + : "r28", "r29", "cc" \ +) + +#define FIXED_MUL_ASM5_IMPRECISE(W_ptr,x_ptr,conv) \ +__asm__ volatile \ +( \ + "l.ori r25,r0,0xffff; " \ + "l.ori r26,r0,16; " \ + "l.andi r27,r27,0x0000; " \ + "l.lwz r28,0x0000(%0); " \ + "l.lwz r29,0x0000(%1); " \ + "l.ror r30,r28,r26; " \ + "l.ror r31,r29,r26; " \ + "l.and r30,r30,r25; " \ + "l.and r31,r31,r25; " \ + "l.mul r30,r30,r31; " \ + "l.add r27,r27,r30; " \ + "l.and r28,r28,r25; " \ + "l.and r29,r29,r25; " \ + "l.mul r28,r28,r29; " \ + "l.add r27,r27,r28; " \ + "l.lwz r28,0x0004(%0); " \ + "l.lwz r29,0xfffc(%1); " \ + "l.ror r30,r28,r26; " \ + "l.ror r31,r29,r26; " \ + "l.and r30,r30,r25; " \ + "l.and 
r31,r31,r25; " \ + "l.mul r30,r30,r31; " \ + "l.add r27,r27,r30; " \ + "l.and r28,r28,r25; " \ + "l.and r29,r29,r25; " \ + "l.mul r28,r28,r29; " \ + "l.add r27,r27,r28; " \ + "l.lwz r28,0x0008(%0); " \ + "l.lwz r29,0xfff8(%1); " \ + "l.ror r31,r29,r26; " \ + "l.and r28,r28,r25; " \ + "l.and r31,r31,r25; " \ + "l.mul r28,r28,r31; " \ + "l.add r27,r27,r28; " \ + "l.srli r27,r27,0xD; " \ + "l.add %2,%2,r27; " \ + : "=r"(W_ptr), "=r"(x_ptr), "=&r"(conv) \ + : "r"(W_ptr), "r"(x_ptr), "r"(conv) \ + : "r27", "r28", "r29", "cc" \ +) + +#define FIXED_MUL_ASM5_PRECISE(W_ptr,x_ptr,conv) \ +__asm__ volatile \ +( \ + "l.lhs r28,0x0000(%0); " \ + "l.lhs r29,0x0000(%1); " \ + "l.mul r28,r28,r29; " \ + "l.srli r28,r28,0xD; " \ + "l.add %2,%2,r28; " \ + "l.lhs r28,0x0002(%0); " \ + "l.lhs r29,0xfffe(%1); " \ + "l.mul r28,r28,r29; " \ + "l.srli r28,r28,0xD; " \ + "l.add %2,%2,r28; " \ + "l.lhs r28,0x0004(%0); " \ + "l.lhs r29,0xfffc(%1); " \ + "l.mul r28,r28,r29; " \ + "l.srli r28,r28,0xD; " \ + "l.add %2,%2,r28; " \ + "l.lhs r28,0x0006(%0); " \ + "l.lhs r29,0xfffa(%1); " \ + "l.mul r28,r28,r29; " \ + "l.srli r28,r28,0xD; " \ + "l.add %2,%2,r28; " \ + "l.lhs r28,0x0008(%0); " \ + "l.lhs r29,0xfff8(%1); " \ + "l.mul r28,r28,r29; " \ + "l.srli r28,r28,0xD; " \ + "l.add %2,%2,r28; " \ + : "=r"(W_ptr), "=r"(x_ptr), "=&r"(conv) \ + : "r"(W_ptr), "r"(x_ptr) \ + : "r27", "r28", "r29", "cc" \ +) + +// #define FIXED_MUL_ASM25_PRECISE(W_ptr,x_ptr,conv) \ +// __asm__ volatile \ +// ( \ +// "l.lwz r28,0x0000(%0); " \ +// "l.lwz r29,0x0000(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x0004(%0); " \ +// "l.lwz r29,0xfffc(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x0008(%0); " \ +// "l.lwz r29,0xfff8(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x000c(%0); " \ +// "l.lwz r29,0xfff4(%1); " \ +// "l.mul 
r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x0010(%0); " \ +// "l.lwz r29,0xfff0(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// \ +// "l.slli %1,r28,0x2; " \ +// \ +// "l.lwz r28,0x0014(%0); " \ +// "l.lwz r29,0x0000(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x0018(%0); " \ +// "l.lwz r29,0xfffc(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x001c(%0); " \ +// "l.lwz r29,0xfff8(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x0020(%0); " \ +// "l.lwz r29,0xfff4(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// "l.lwz r28,0x0024(%0); " \ +// "l.lwz r29,0xfff0(%1); " \ +// "l.mul r28,r28,r29; " \ +// "l.srli r28,r28,0xD; " \ +// "l.add %2,%2,r28; " \ +// // WIP +// : "=r"(W_ptr), "=r"(x_ptr), "=&r"(conv) \ +// : "r"(W_ptr), "r"(x_ptr), "r"(w), "r"(fw) \ +// : "r27", "r28", "r29", "cc" \ +// ) + +#define XPTR_UPDATE(x_ptr,w,fw) \ +__asm__ volatile \ +( \ + "l.sub r28,%1,%2; " \ + "l.slli r28,r28,0x2; " \ + "l.sub %0,%0,r28; " \ + : "=&r"(x_ptr) \ + : "r"(x_ptr), "r"(w), "r"(fw) \ + : "r28", "cc" \ +) + +int test_singlethread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, int, int, int, int, int, int, int), char *str); +int test_multithread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, int, int, int, int, int, int, int), char *str); + +void conv16_gold(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_asm_mul(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +// void conv16_asm_mac(int16_t *__restrict__ W, int16_t *__restrict__ x, 
int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_asm_mul_unrolled_5x5(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_unrolled_5x5(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_unrolled_ptr_5x5(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +// void conv16_gold_four_finest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b, int32_t *shared_conv); +// void conv16_gold_four_fine(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_gold_four_coarse(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_unrolled_5x5_four_coarse(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_unrolled_ptr_5x5_four_coarse(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_gold_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_unrolled_5x5_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_unrolled_ptr_5x5_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, 
int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); +void conv16_asm_mul_unrolled_5x5_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b); + +void load(); +int check(int16_t *y); +int checksum(int16_t *y); + +int16_t correct_yout[OH*OW] = { + 0x0352, + 0x036c, + 0x0386, + 0x03a0, + 0x03ba, + 0x03d4, + 0x03ee, + 0x0408, + 0x0422, + 0x043c, + 0x0456, + 0x0470, + 0x04ee, + 0x0508, + 0x0522, + 0x053c, + 0x0556, + 0x0570, + 0x058a, + 0x05a4, + 0x05be, + 0x05d8, + 0x05f2, + 0x060c, + 0x068a, + 0x06a4, + 0x06be, + 0x06d8, + 0x06f2, + 0x070c, + 0x0726, + 0x0740, + 0x075a, + 0x0774, + 0x078e, + 0x07a8, + 0x0826, + 0x0840, + 0x085a, + 0x0874, + 0x088e, + 0x08a8, + 0x08c2, + 0x08dc, + 0x08f6, + 0x0910, + 0x092a, + 0x0944, + 0x09c2, + 0x09dc, + 0x09f6, + 0x0a10, + 0x0a2a, + 0x0a44, + 0x0a5e, + 0x0a78, + 0x0a92, + 0x0aac, + 0x0ac6, + 0x0ae0, + 0x0b5e, + 0x0b78, + 0x0b92, + 0x0bac, + 0x0bc6, + 0x0be0, + 0x0bfa, + 0x0c14, + 0x0c2e, + 0x0c48, + 0x0c62, + 0x0c7c, + 0x0cfa, + 0x0d14, + 0x0d2e, + 0x0d48, + 0x0d62, + 0x0d7c, + 0x0d96, + 0x0db0, + 0x0dca, + 0x0de4, + 0x0dfe, + 0x0e18, + 0x0e96, + 0x0eb0, + 0x0eca, + 0x0ee4, + 0x0efe, + 0x0f18, + 0x0f32, + 0x0f4c, + 0x0f66, + 0x0f80, + 0x0f9a, + 0x0fb4, + 0x1032, + 0x104c, + 0x1066, + 0x1080, + 0x109a, + 0x10b4, + 0x10ce, + 0x10e8, + 0x1102, + 0x111c, + 0x1136, + 0x1150, + 0x11ce, + 0x11e8, + 0x1202, + 0x121c, + 0x1236, + 0x1250, + 0x126a, + 0x1284, + 0x129e, + 0x12b8, + 0x12d2, + 0x12ec, + 0x136a, + 0x1384, + 0x139e, + 0x13b8, + 0x13d2, + 0x13ec, + 0x1406, + 0x1420, + 0x143a, + 0x1454, + 0x146e, + 0x1488, + 0x1506, + 0x1520, + 0x153a, + 0x1554, + 0x156e, + 0x1588, + 0x15a2, + 0x15bc, + 0x15d6, + 0x15f0, + 0x160a, + 0x1624, +}; + +#endif diff --git a/reliability_tests/icache_fi_conv16/icache_conv16.c b/reliability_tests/icache_fi_conv16/icache_conv16.c new file mode 100644 index 
0000000..e62c440 --- /dev/null +++ b/reliability_tests/icache_fi_conv16/icache_conv16.c @@ -0,0 +1,935 @@ +/* + * Copyright (C) 2018 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Mantainer: Luca Valente, luca.valente2@unibo.it + */ +/****************************************************************************** + * * + * Multitherman Lab @ DEI - University of Bologna * + * Viale Risorgimento 2 40136 * + * Bologna - phone 0512092759 * + * * + * Engineer: Francesco Conti - f.conti@unibo.it * + * * + * Project: CConvNet * + * File: conv16.c * + * Description: 16-bit fixed point convolution test * + * * + ******************************************************************************/ + +#include +#include +#include "conv16.h" + +__attribute__((section(".heapsram"))) int16_t g_W[FH*FW]; +__attribute__((section(".heapsram"))) int16_t g_x[IH*IW]; +__attribute__((section(".heapsram"))) int16_t g_y[OH*OW]; +__attribute__((section(".heapsram"))) int16_t g_y_in[OH*OW]; + +int main() { + + if (rt_cluster_id() != 0) + return bench_cluster_forward(0); + + int errors = 0; + int sum = 0; + + // single-threaded "golden" by-the-book convolution + errors += test_singlethread(&conv16_gold, "sequential convolution"); + + // single-threaded loop-unrolled convolution + errors += test_singlethread(&conv16_unrolled_5x5, "sequential loop-unrolled convolution"); + + // single-threaded loop-unrolled pointer-optimized convolution + 
errors += test_singlethread(&conv16_unrolled_ptr_5x5, "sequential loop-unrolled pointer-optimized convolution"); + + // multi-threaded by-the-book convolution (1 thread per output pixel) + errors += test_multithread(&conv16_gold_four_coarse, "4-threaded convolution (1 thread per output pixel)"); + + // multi-threaded loop-unrolled convolution (1 thread per output pixel) + errors += test_multithread(&conv16_unrolled_5x5_four_coarse, "4-threaded loop-unrolled convolution (1 thread per output pixel)"); + + // multi-threaded loop-unrolled pointer-optimized convolution (1 thread per output pixel) + errors += test_multithread(&conv16_unrolled_ptr_5x5_four_coarse, "4-threaded loop-unrolled pointer-optimized convolution (1 thread per output pixel)"); + + // multi-threaded by-the-book convolution (1 thread per output row) + errors += test_multithread(&conv16_gold_four_coarsest, "4-threaded convolution (1 thread per output row)"); + + // multi-threaded loop-unrolled convolution (1 thread per output row) + errors += test_multithread(&conv16_unrolled_5x5_four_coarsest, "4-threaded loop-unrolled convolution (1 thread per output row)"); + + // multi-threaded loop-unrolled pointer-optimized convolution (1 thread per output row) + errors += test_multithread(&conv16_unrolled_ptr_5x5_four_coarsest, "4-threaded loop-unrolled pointer-optimized convolution (1 thread per output row)"); + + synch_barrier(); + + // TODO readout icache errors + + return errors; +} + +int test_singlethread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, int, int, int, int, int, int, int), char *str) { + int errors = 0; + int sum = 0; + + synch_barrier(); + + if(rt_core_id() == 0) { + load(); + + reset_timer(rt_cluster_id()); + start_timer(rt_cluster_id()); + test(g_W, g_x, g_y, IH, IW, FH, FW, OH, OW, 1, 0, 0); + stop_timer(rt_cluster_id()); + + #ifdef CHECK_CHECKSUM + errors = 0; + sum = checksum(g_y); + if(sum != RIGHT_CHECKSUM) { + #ifndef PULP_SPI + printf("wrong checksum, 0x%08x instead of 
0x%08x\n", sum, RIGHT_CHECKSUM); + #ifndef CHECK_ERROR + errors += 1; + #endif + #endif + #ifdef CHECK_ERROR + errors = check(g_y); + #endif + } + #else + errors = -1; + #endif + + #ifndef PULP_SPI + printf("%s, errors=%d, time=%d\n", str, errors, get_time(rt_cluster_id())); + #endif + + } + + return errors; +} + +int test_multithread(void (*test)(int16_t *, int16_t *, int16_t *, int, int, int, int, int, int, int, int, int), char *str) { + int errors = 0; + int sum = 0; + + if(rt_core_id() == 0) { + load(); + } + + synch_barrier(); + + if(rt_core_id() == 0) { + reset_timer(rt_cluster_id()); + start_timer(rt_cluster_id()); + } + test(g_W, g_x, g_y, IH, IW, FH, FW, OH, OW, 1, 0, 0); + if(rt_core_id() == 0) { + stop_timer(rt_cluster_id()); + + #ifdef CHECK_CHECKSUM + errors = 0; + sum = checksum(g_y); + if(sum != RIGHT_CHECKSUM) { + #ifndef PULP_SPI + printf("wrong checksum, 0x%08x instead of 0x%08x\n", sum, RIGHT_CHECKSUM); + #ifndef CHECK_ERROR + errors += 1; + #endif + #endif + #ifdef CHECK_ERROR + errors = check(g_y); + #endif + } + #else + errors = -1; + #endif + + #ifndef PULP_SPI + printf("%s, errors=%d, time=%d\n", str, errors, get_time(rt_cluster_id())); + #endif + + } + + return errors; +} + +void conv16_gold(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + int i; + for (i=0; i> QF)); + #endif + } + } +} + +void conv16_unrolled_5x5(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + int i; + for (i=0; i> QF); + + #endif + + } + } +} + +void conv16_unrolled_ptr_5x5(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + register int i; + register int j; + + int16_t *y_ptr = y + a*oh*ow; + int16_t *x_base = x + b*h*w + (fh-1)*w + (fw-1); + int16_t *W_base = 
&W[((a*nif)+b)*fh*fw]; + int16_t *W_ptr; + int16_t *x_ptr; + + for (i=0; i> QF; + + #endif + + } + } + +} + +void conv16_gold_four_coarse(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + int i; + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + // synch_barrier(); + for (i=0; i> QF)); + #endif + } + } + synch_barrier(); +} + +void conv16_gold_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + int i; + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + // synch_barrier(); + for (i=myid; i> QF)); + #endif + } + } + synch_barrier(); +} + +void conv16_unrolled_5x5_four_coarse(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + int i; + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + // synch_barrier(); + for (i=0; i> QF); // because i'm using 32-bit int + + #endif + + } + } + synch_barrier(); +} + +void conv16_unrolled_5x5_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + int i; + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + // synch_barrier(); + for (i=myid; i> QF); // because i'm using 32-bit int + + #endif + + } + } + synch_barrier(); +} + +void conv16_unrolled_ptr_5x5_four_coarse(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + register int i; + register int j; + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + int16_t *W_ptr; + int16_t *x_ptr; + int16_t *y_ptr = y + a*oh*ow; + int16_t *x_base = x + b*h*w + 
(fh-1)*w + (fw-1); + int16_t *W_base = &W[((a*nif)+b)*fh*fw]; + + // synch_barrier(); + + for (i=0; i> QF; + + #endif + + } + } + + synch_barrier(); +} + +void conv16_unrolled_ptr_5x5_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + // register int i; + // register int j; + // register int myid = rt_core_id(); + // int16_t *W_ptr; + // int16_t *x_ptr; + int16_t *y_ptr = y + a*oh*ow; + int16_t *x_base = x + b*h*w + (fh-1)*w + (fw-1); + int16_t *W_base = &W[((a*nif)+b)*fh*fw]; + + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + + register int i; + register int j; + int16_t *W_ptr; + int16_t *x_ptr; + + for (i=myid; i> QF; + + #endif + + } + } + + synch_barrier(); + +} + +#ifndef __GCC__ +#ifndef __riscv__ +void conv16_asm_mul_unrolled_5x5_four_coarsest(int16_t *__restrict__ W, int16_t *__restrict__ x, int16_t *__restrict__ y, int h, int w, int fh, int fw, int oh, int ow, int nif, int a, int b) { + register int i; + register int j; + register int myid = rt_core_id(); + register int num_cores = get_core_num(); + register int16_t *W_ptr; + register int16_t *x_ptr; + int16_t *y_ptr = y + a*oh*ow; + int16_t *x_base = x + b*h*w + (fh-1)*w + (fw-1); + int16_t *W_base = &W[((a*nif)+b)*fh*fw]; + for (i=myid; i + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __XPULPNN_KERNELS__ +#define __XPULPNN_KERNELS__ + +void xpulp_nn_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, 
+ uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + 
uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2_4x4( 
+ uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2_4x4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, 
+ uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + 
uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2_4x4( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, 
+ int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, 
+ uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + /* Prototype of xpulp_nn_conv_i2_i2_i2. Unlike the surrounding prototypes, its parameter list depends on whether PROFILE is defined. NOTE(review): the last flag is spelled flag_batch_norm here but flag_batchnorm in the neighbouring prototypes; harmless in a declaration, confirm whether intentional. */ +void xpulp_nn_conv_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +#ifndef PROFILE /* PROFILE undefined: the parameter list ends at flag_batch_norm */ + uint8_t flag_batch_norm); +#else /* PROFILE defined: same parameters followed by six uint32_t pointers named after cycle counts (presumably counters written by the kernel; TODO confirm against the definition) */ + uint8_t flag_batch_norm, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif /* PROFILE */ + + +void xpulp_nn_conv_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, 
+ uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i8_i2_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u8_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, +
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i8_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u8_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void
xpulp_nn_conv1d_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i8_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u8_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, +
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i8_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u8_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, +
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i8_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u8_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, +
uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i8_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u8_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i8_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u4_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i4_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u4_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i4_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u4_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i4_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, +
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u4_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i4_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, +
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u4_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i4_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, +
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u4_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i4_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u2_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u2_i8( + uint8_t *pIn, + uint8_t
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i2_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u2_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i2_i8 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u2_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, +
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i2_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u2_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void
xpulp_nn_conv1d_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix i4_i2_i4 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix u4_u2_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, +
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: uint8_t* pIn/pIm2ColBuffer, int8_t* pOut. Suffix u4_i2_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* conv1d prototype: int8_t* pIn/pIm2ColBuffer, uint8_t* pOut. Suffix i4_u2_i2 presumably encodes in_out_weight precision -- TODO confirm. */ +void xpulp_nn_conv1d_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* _4x4 variant of the preceding prototype; same parameter list. */ +void xpulp_nn_conv1d_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, +
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t 
ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, 
+ int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, 
+ uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4( + int8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4( + uint8_t *pIn, 
+ int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + /* xpulp_nn_matmul_i2_i2_i2: the parameter list of this prototype depends on PROFILE. When PROFILE is not defined the list ends at flag_batch_norm; when PROFILE is defined, four extra uint32_t pointer parameters follow it (hotloop_prep_cycles, hotloop_cycles, requant_cycles, hotloop_leftover_cycles) -- presumably out-pointers for cycle counts, TODO confirm against the definition. Callers must be built with the same PROFILE setting as the definition or the argument lists will not match. NOTE(review): the last flag is spelled flag_batch_norm here but flag_batchnorm in the neighbouring prototypes. */ +uint8_t *xpulp_nn_matmul_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +uint8_t *xpulp_nn_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t
*pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + 
uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i8( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void 
xpulp_nn_linear_u4_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i2( + 
int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_maxpool_u8( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i8( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u4( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i4( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u2( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + 
uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t 
padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + 
uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t 
out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t 
out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif diff --git a/rt_nn_tests/xpnn_conv/pulp_nn_mix_kernels.h b/rt_nn_tests/xpnn_conv/pulp_nn_mix_kernels.h new file mode 100644 index 0000000..8b2a3c6 --- /dev/null +++ b/rt_nn_tests/xpnn_conv/pulp_nn_mix_kernels.h @@ -0,0 +1,7093 @@ +/* + * pulp_nn_mix_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_MIXED_KERNELS__ /* include guard; a macro name cannot contain '-' */ +#define __XPULPNN_MIXED_KERNELS__ + +void xpulp_nn_mix_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4_4x4( + uint8_t *pIn,
+ uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, 
+ uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, 
+ uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8( + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, 
+ uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_mix_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t 
lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * 
pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + 
uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + 
uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xpnn_conv/pulp_nn_utils.h b/rt_nn_tests/xpnn_conv/pulp_nn_utils.h new file mode 100644 index 0000000..44d2251 --- /dev/null +++ b/rt_nn_tests/xpnn_conv/pulp_nn_utils.h @@ -0,0 +1,2079 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include + +typedef signed short v2s __attribute__((vector_size (4))); + + + +#define min(a,b) ((a)<(b)?(a):(b)) +#define log2(x) __builtin_pulp_fl1(x) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define CHANS_DECOMPR(x) (5*x >> 2) // equivalent to division by 0.8 + +/* Functions for Compressed MAC */ +#define CompressedMAC(sum, ptr, config) asm volatile( \ + "pv.smlsdotsp.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define CompressedMACUnsigned(sum, ptr, config) asm volatile( \ + "pv.smlsdotsup.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define InitNNRF(ptr, config) asm volatile( \ + "pv.smlsdotsp.t x0, %[phtr], %[chonfig];" \ + : [phtr] "+r" (ptr) : [chonfig] "I" (config)) + +#define ThresholdCompress(res, val, thrs) asm volatile( \ + "pv.thrc %[rhes], %[vhal], %[thhrs];" : [rhes] "+r" (res) : [vhal] "r" (val), [thhrs] "r" (thrs)) + +#define GetConfig(a_update, b_update, a_reg, b_reg) a_update << 4 | b_update << 3 | a_reg << 1 | b_reg + +/* Functions for threshold&compress */ +#define check_store(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; \ + incr_val=ch_out_r; } + +#define check_store_4x1(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; } + +#define reset_currThr() \ + if ((uint32_t *) currThr == (uint32_t *) (pThr + ch_out)) { 
\ + currThr = (v2s *) pThr; \ + } + +#define MacLoads20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define MacLoad20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +/* Functions for compressed min/max */ +#define CompressedMax(res, in1, in2) asm volatile( \ + "pv.max.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define CompressedMin(res, in1, in2) asm volatile( \ + "pv.min.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Max16(res, in1, in2) asm volatile( \ + "pv.max.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Min16(res, in1, in2) asm volatile( \ + "pv.min.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define thr_cmp(state, val, threshs) __builtin_pulp_thresh_compr(state, val, threshs) + +typedef unsigned char v4u __attribute__((vector_size (4))); +typedef signed char v4s __attribute__((vector_size (4))); + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#ifdef __clang__ +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_binsert(dst,not_mask_imm,src,mask_imm,off) +#else +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#endif +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define maxs20(a, b) __builtin_pulp_max20(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) 
__builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define mins20(a, b) __builtin_pulp_min20(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define 
MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, 
+ int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + 
int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= 
pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) 
pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) 
bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t 
*__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) 
bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 
20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) 
bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = 
*((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) 
bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], 
com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void 
__attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) 
bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = 
(v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_tnn_zero_mem_ternary(uint8_t * pBuffer, unsigned int size, unsigned int uns) +{ + uint8_t pad_val = 0xd9; + uint32_t pad_vec = 0xd9d9d9d9; + if (uns) { + // if we are using an unsigned kernel, we need to pad with -1 because the hardware will add a +1 to ALL values! 
+ pad_val = 0xff; + pad_vec = 0xffffffff; + } + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u)pad_vec; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=pad_val; + lfover-=4; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + 
uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} 
+static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_tnn_compare_and_replace_if_larger_ternary(int8_t * base, + int8_t * target, + uint16_t length) +{ + uint8_t mask2 = 0x0c; + uint8_t n_mask2 = ~ mask2; + uint8_t mask4 = 0x30; + uint8_t n_mask4 = ~ mask4; + uint8_t mask6 = 0xc0; + uint8_t n_mask6 = ~ mask6; + uint8_t off2 = 2; + uint8_t off4 = 4; + uint8_t off6 = 6; + + uint8_t *pIn = (uint8_t *) base; + uint8_t *pCom = (uint8_t *) target; + uint8_t *out; + + int cnt = length >> 2; + uint32_t result; + + while(cnt > 0u) + { + uint32_t in1 
= *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + *((uint32_t *)pIn) = result; + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + if (left>0u) + { + // do the vector max on the whole word - we won't use the leftover bytes + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + + // ...and copy back the relevant bytes of the result to pIn + for (int i=0; i> (8*i)); + + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, 
off6); + + pIn++; + pCom++; + left--; + } +} + +#endif diff --git a/rt_nn_tests/xpnn_conv/pulp_nn_utils_xpnn.h b/rt_nn_tests/xpnn_conv/pulp_nn_utils_xpnn.h new file mode 100644 index 0000000..0c783ae --- /dev/null +++ b/rt_nn_tests/xpnn_conv/pulp_nn_utils_xpnn.h @@ -0,0 +1,1937 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" +#ifdef GAP_SDK +#include "pulp.h" +#endif + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) 
+#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + 
int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = 
(integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + 
int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = 
(uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) 
out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + 
+ pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) 
bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 
2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); 
+ bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 
4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length 
>> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + 
*((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) 
inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) 
bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) 
bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + 
while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, 
(int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = 
bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +#endif diff --git a/rt_nn_tests/xpnn_conv/test.c b/rt_nn_tests/xpnn_conv/test.c new file mode 100644 index 0000000..5ce4bde --- /dev/null +++ b/rt_nn_tests/xpnn_conv/test.c @@ -0,0 +1,221 @@ +#include +#include +#include + + +#include "pmsis.h" + +#include "data_statstest.h" +//#include "pulp_nn_kernels.h" +#include "pulp_nn_mix_kernels.h" + + +#define start_cycle_counter() asm volatile("csrw 0xCC0, 0x01;") +#define stop_cycle_counter() asm volatile("csrw 0xCC0, 0x00;") +#define read_cycle_counter(x) asm volatile("csrr %0, 0x780;" : "=r" (x)) +#define reset_cycle_counter() asm volatile("csrw 0x780, 0x0;") + +uint8_t im2col[IM2COL_DIM] = {0}; +uint8_t outputs[OUTPUT_DIM] = {0}; + +int32_t outputs_fp[OUTPUT_DIM_FP] = {0}; + +#ifndef PROFILE +int num_cycles; +#else +int im2col_cycles; +int hotloop_prep_cycles; +int hotloop_cycles; +int threshold_cycles; +int requant_cycles; +int hotloop_leftover_cycles; +int matmul4x2_leftover_cycles; +#endif + +void call_krnl_0(); +void test_0(); + +int main(int argc, char *argv[]) +{ +#if KRAKEN_PTEST == 1 + kraken_padframe_aon_pad_gpioa_cfg_rxe_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_trie_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); +#endif + int n_mismatches = 0; + int mismatches_tot = 0; + + + + #ifndef PROFILE + num_cycles = 0; + #endif + + if (get_core_id() == 0) { + printf("===> TEST 0: Running xpulp_nn_conv_u4_u8_i2...\n"); + printf(" dims_in = [4, 4]\n"); + printf(" dims_kernel = [3, 3]\n"); + printf(" ch_in/out = [16, 32]\n"); + //printf(" padding_y_top = [%d]\n", padding_y_top); + //printf(" padding_y_bottom = [%d]\n", padding_y_bottom); + //printf(" padding_x_left = [%d]\n", padding_x_left); + //printf(" padding_x_right = [%d]\n", padding_x_right); + //printf(" stride_x = [%d]\n", stride_x); + //printf(" stride_y = [%d]\n", stride_y); 
+ } + test_0(); + #ifndef PROFILE + stop_cycle_counter(); + read_cycle_counter(num_cycles); + if (get_core_id() == 0) { + printf("===> TEST 0: Finished running xpulp_nn_conv_u4_u8_i2\n"); + printf("num_cycles = %d\n", num_cycles); + printf("MACs = 73728\n"); + printf("MACs/cycle = %.4f\n", 73728/num_cycles); + } + #endif + if (get_core_id() == 0) { + printf("Checking for mismatches..\n"); + n_mismatches = 0; + + for(int i=0; i < 512; i++) { + if (outputs[i] != exp_outp_0[i]){ + printf("***Mismatch in test 0 at iteration %d: Expected: %x, got: %x\n", i, exp_outp_0[i], outputs[i]); + n_mismatches++; + } + } + } + mismatches_tot += n_mismatches; + + if (get_core_id() == 0) { + printf("Got %d mismatches in %d tests\n", mismatches_tot, 1); + } + return mismatches_tot; +} + + +void call_krnl_0(void) { + uint8_t * pInp; + uint8_t * pIm2ColBuffer; + int8_t * pBias = NULL; + uint8_t * pOut; + int8_t * pWeight; + uint32_t * pThr; + int32_t * pKappa, pLambda; + #ifdef PROFILE + int32_t im2col_cycles = 0; + int32_t hotloop_prep_cycles = 0; + int32_t hotloop_cycles = 0; + int32_t threshold_cycles = 0; + int32_t requant_cycles = 0; + int32_t hotloop_leftover_cycles = 0; + int32_t matmul4x2_leftover_cycles = 0; + #endif + pInp = inp_l1; + + pOut = outp_l1; + pIm2ColBuffer = im2col_l1; + pWeight = wt_l1; + pThr = threshs_l1; + pKappa = kappa_l1; + pLambda = lambda_l1; +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1); + } +#endif + xpulp_nn_conv_u4_u8_i2( + pInp, + pIm2ColBuffer, + pBias, + pOut, + pWeight, + pKappa, + pLambda, + 1, + 13, + 4, + 4, + 16, + 4, + 4, + 32, + 3, + 3, + 1, + 1, + 1, + 1, + 1, + 1, + 1, +#ifndef PROFILE + 1 +#else + 1, + &im2col_cycles, + &hotloop_prep_cycles, + &hotloop_cycles, + &requant_cycles, + &hotloop_leftover_cycles, + &matmul4x2_leftover_cycles +#endif + ); + +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); + } +#endif + #ifdef 
PROFILE + if (pi_core_id() == 0) { + printf("im2col_cycles = %d\n", im2col_cycles); + printf("hotloop_prep_cycles = %d\n", hotloop_prep_cycles); + printf("hotloop_cycles = %d\n", hotloop_cycles); + printf("requant_cycles = %d\n", requant_cycles); + printf("threshold_cycles = %d\n", threshold_cycles); + printf("hotloop_leftover_cycles = %d\n", hotloop_leftover_cycles); + printf("matmul4x2_leftover_cycles = %d\n", matmul4x2_leftover_cycles); + } + #endif +} + +void test_0(void) { + // DMA transfer inputs from L2 to L1 + if (pi_core_id() == 0) { + plp_dma_memcpy(pIn_0, inp_l1, 128, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + if (pi_core_id() == 0) { + plp_dma_memcpy(pLambda_0, lambda_l1, 32 * 4, 1); // 4 bytes per lambda item + plp_dma_barrier(); + } + if (pi_core_id() == 0) { + plp_dma_memcpy(pKappa_0, kappa_l1, 32 * 4, 1); // 4 bytes per lambda item + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + // transfer weights + if (pi_core_id() == 0) { + plp_dma_memcpy(pWeight_0, wt_l1, 1152, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + call_krnl_0(); + // get outputs back with DMA + if (pi_core_id() == 0) { + plp_dma_memcpy(outputs, outp_l1, 512, 0); + plp_dma_barrier(); + } +} + + + + + + + + + + diff --git a/rt_nn_tests/xpnn_conv/xpulp_nn_conv_u4_u8_i2.c b/rt_nn_tests/xpnn_conv/xpulp_nn_conv_u4_u8_i2.c new file mode 100644 index 0000000..f625bfc --- /dev/null +++ b/rt_nn_tests/xpnn_conv/xpulp_nn_conv_u4_u8_i2.c @@ -0,0 +1,303 @@ +/* + * xpulp_nn_conv_u4_u8_i2.c + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" +#include "pulp_nn_utils.h" +#include "pulp_nn_kernels.h" + + + + +void __attribute__((noinline)) xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mult, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batch_norm) +{ + uint16_t ch_in_r = PACK_INT4_SIZE(ch_in); + uint16_t ch_out_r = PACK_INT8_SIZE(ch_out); + + int core_id = pi_core_id(); + uint8_t * pIm2ColBase = pIm2ColBuffer + (2 * core_id * PACK_INT4_SIZE(ch_in) * dim_kernel_x * dim_kernel_y); + int i_out_y, i_out_x, i_ker_y, i_ker_x; + int Log2Core; + + uint8_t extra_chunk = ((dim_out_y & (NUM_CORES-1)) != 0); + uint8_t extra_chunk_r; + uint16_t dim_out_x_r; + uint8_t section; + int core_id_r; + + if(extra_chunk && dim_out_x > 1) + { + Log2Core = log2(NUM_CORES >> 1); + core_id_r = (core_id >> 1); + dim_out_x_r = (dim_out_x >> 1); + section = (core_id & 0x1); + extra_chunk_r = ((dim_out_y & ((NUM_CORES >> 1) - 1)) != 0); + } + else + { + Log2Core = log2(NUM_CORES); + core_id_r = core_id; + dim_out_x_r = dim_out_x; + section = 0; + extra_chunk_r = extra_chunk; + extra_chunk = 0; + } + + uint8_t flag_dim_out_x_odd = 
dim_out_x & 0x01; + + int chunk = (dim_out_y >> Log2Core) + extra_chunk_r; + + int start_pixel = min((chunk * core_id_r), dim_out_y); + int stop_pixel = min(start_pixel + chunk, dim_out_y); + + uint8_t *pIm2Col = pIm2ColBase; + uint8_t *pOutBuffer = pOut + (start_pixel * ch_out_r * dim_out_x) + (section * ch_out_r * dim_out_x_r); + + for (i_out_y = start_pixel; i_out_y < stop_pixel; i_out_y++) + { + for(i_out_x=(section * dim_out_x_r); i_out_x<(dim_out_x_r + (section * (dim_out_x_r + flag_dim_out_x_odd))); i_out_x++) + { + if(i_out_y < padding_y_top) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x=((i_out_x * stride_x) - padding_x_left); i_ker_x<((i_out_x * stride_x) - padding_x_left + dim_kernel_x); i_ker_x++) + { + if((i_ker_y < 0) || (i_ker_y >= dim_in_y) || (i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t*) (pIn + ((i_ker_y * dim_in_x + i_ker_x) * ch_in_r)), pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + else if(i_out_y < dim_out_y - padding_y_bottom) + { + if(i_out_x < padding_x_left) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x=((i_out_x * stride_x) - padding_x_left); i_ker_x<((i_out_x * stride_x) - padding_x_left + dim_kernel_x); i_ker_x++) + { + if((i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t*) (pIn + ((i_ker_y * dim_in_x + i_ker_x) * ch_in_r)), pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + else if(i_out_x < (dim_out_x - padding_x_right)) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + xpulp_nn_im2col_u4_to_u4((uint8_t*) pIn + (i_ker_y * dim_in_x + 
i_out_x * stride_x - padding_x_left)*ch_in_r,pIm2Col,ch_in * dim_kernel_x); + pIm2Col+=PACK_INT4_SIZE(ch_in * dim_kernel_x); + } + } + else + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x = i_out_x * stride_x - padding_x_left; i_ker_x < i_out_x * stride_x - padding_x_left + dim_kernel_x; i_ker_x++) + { + if((i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t *)pIn + (i_ker_y*dim_in_x+i_ker_x)* ch_in_r, pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + } + else + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x = i_out_x * stride_x - padding_x_left; i_ker_x < i_out_x * stride_x - padding_x_left + dim_kernel_x; i_ker_x++) + { + if(i_ker_y < 0 || (i_ker_y >= dim_in_y) || i_ker_x < 0 || i_ker_x >= dim_in_x) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t *) pIn + (i_ker_y * dim_in_x + i_ker_x) * ch_in_r, pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + if(pIm2Col == (pIm2ColBase + ((PACK_INT4_SIZE(ch_in) * dim_kernel_x * dim_kernel_y) << 1))) + { + pOutBuffer = xpulp_nn_matmul_u4_u8_i2( + pIm2ColBase, + pBias, + pOutBuffer, + pOutBuffer + ch_out_r, + pWeight, + pKappa, + pLambda, + out_mult, + out_shift, + (ch_in * dim_kernel_x * dim_kernel_y), + ch_out, + flag_relu, + flag_batch_norm + ); + + pIm2Col = pIm2ColBase; + } + } + + if(pIm2Col != pIm2ColBase) + { + const int8_t *pA = pWeight; + int i; + int32_t * k1 = pKappa; + int32_t * lambda1 = pLambda; + + v4s inA[2]; + uint8_t out[1]; + uint16_t num_col_im2col = ch_in * dim_kernel_x * dim_kernel_y; + uint16_t num_col_im2col_w = PACK_INT2_SIZE(ch_in) * dim_kernel_x * dim_kernel_y; + + for(i = 0; i < ch_out; i++) + { + int sum = 0; + if (pBias != NULL) + { + 
sum = *((int*) pBias); + pBias+= 4; + } + + uint8_t *pB = pIm2ColBase; + + int32_t *ptrA = (int32_t *)pA; + uint32_t *ptrB = (uint32_t *)pB; + + for(int j=0; j < (num_col_im2col >> 4); j++) + { + pA = pulp_nn_i2_to_i4(pA,inA); + + ptrA = (int32_t *)inA; + + sum = SumDotp8(*(uint32_t *)ptrB, *(int32_t *)ptrA, sum); + + ptrA++; + ptrB++; + + sum = SumDotp8(*(uint32_t *)ptrB, *(int32_t *)ptrA, sum); + + ptrA++; + ptrB++; + } + + int col_cnt_im2col = num_col_im2col & 0xf; + + if(col_cnt_im2col) + { + + uint16_t loop_cnt_im2col_a = (num_col_im2col >> 4) << 3; + pB+=loop_cnt_im2col_a; + + do + { + int8_t inA1 = (int8_t) bitext((int) *pA, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((uint32_t) *pB, 4, 0); + sum += inA1 * inB1; + inA1 = (int8_t) bitext((int) *pA, 2, 2); + inB1 = (uint8_t) bitextu((uint32_t) *pB, 4, 4); + sum += inA1 * inB1; + pB++; + inA1 = (int8_t) bitext((int) *pA, 2, 4); + inB1 = (uint8_t) bitextu((uint32_t) *pB, 4, 0); + sum += inA1 * inB1; + inA1 = (int8_t) bitext((int) *pA, 2, 6); + inB1 = (uint8_t) bitextu((uint32_t) *pB, 4, 4); + sum += inA1 * inB1; + + pA++; + pB++; + col_cnt_im2col-=4; + } while(col_cnt_im2col); + } + if (flag_batch_norm && flag_relu) + { + *pOutBuffer = pulp_nn_bn_quant_u8(sum, *k1, *lambda1, out_shift); + k1++; + lambda1++; + pOutBuffer++; + } + else + { + if(flag_relu == 1) + { + *pOutBuffer = pulp_nn_quant_u8(sum, out_mult, out_shift); + pOutBuffer++; + } + else + { + *pOutBuffer = (uint8_t) clip8(sum >> out_shift); + pOutBuffer++; + } + } + } + } + pOutBuffer+=(extra_chunk * ((dim_out_x_r + ((1 - section) * flag_dim_out_x_odd)) * ch_out_r)); + pIm2Col = pIm2ColBase; + } + pi_cl_team_barrier(); +} diff --git a/rt_nn_tests/xpnn_conv/xpulp_nn_matmul_u4_u8_i2.c b/rt_nn_tests/xpnn_conv/xpulp_nn_matmul_u4_u8_i2.c new file mode 100644 index 0000000..5713be9 --- /dev/null +++ b/rt_nn_tests/xpnn_conv/xpulp_nn_matmul_u4_u8_i2.c @@ -0,0 +1,490 @@ +/* + * xpulp_nn_matmul_u4_u8_i2.c + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 
University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" +#include "pulp_nn_utils.h" + + + +uint8_t * __attribute__((noinline)) xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mult, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batch_norm) +{ + int32_t vecA[2]; + int32_t vecA2[2]; + int32_t vecA3[2]; + int32_t vecA4[2]; + + uint16_t ch_out_r = PACK_INT8_SIZE(ch_out); + + uint16_t num_col_im2col_w = PACK_INT2_SIZE(num_col_im2col); + uint16_t num_col_im2col_a = PACK_INT4_SIZE(num_col_im2col); + + //uint8_t *pOut2 = pOut + ch_out_r; + int8_t *pA = pWeight; + + uint16_t chan_left = ch_out & 0x3; + + for(int i=0; i < (ch_out >> 2); i++) + { + uint8_t *pB = pIn; + uint8_t *pB2 = (pB + num_col_im2col_a); + + uint32_t *ptrB = (uint32_t *) pB; + uint32_t *ptrB2 = (uint32_t *) pB2; + + int8_t *pA2 = (pA + num_col_im2col_w); + int8_t *pA3 = (pA2 + num_col_im2col_w); + int8_t *pA4 = (pA3 + num_col_im2col_w); + + pA = pulp_nn_i2_to_i4(pA , vecA); + pA2 = pulp_nn_i2_to_i4(pA2, vecA2); + pA3 = pulp_nn_i2_to_i4(pA3, vecA3); + pA4 = pulp_nn_i2_to_i4(pA4, vecA4); + + int32_t *startA; + int32_t *startA2; + int32_t *startA3; + int32_t *startA4; + + asm volatile("mv %0, %1":"=r"(startA):"r"(vecA)); + asm volatile("mv %0, %1":"=r"(startA2):"r"(vecA2)); + asm 
volatile("mv %0, %1":"=r"(startA3):"r"(vecA3)); + asm volatile("mv %0, %1":"=r"(startA4):"r"(vecA4)); + + int32_t *ptrA = (int32_t *) vecA ; + int32_t *ptrA2 = (int32_t *) vecA2; + int32_t *ptrA3 = (int32_t *) vecA3; + int32_t *ptrA4 = (int32_t *) vecA4; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + ptrA3 = MacLoadInit(1, 0, 2, 0, ptrA3); + ptrA4 = MacLoadInit(1, 0, 3, 0, ptrA4); + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int sum5 = 0; + int sum6 = 0; + int sum7 = 0; + int sum8 = 0; + + + if (pBias != NULL) + { + sum = *((int*) pBias); + pBias+= 4; + sum2 = *((int*) pBias); + pBias+= 4; + sum3 = *((int*) pBias); + pBias+= 4; + sum4 = *((int*) pBias); + pBias+= 4; + + sum5 = sum; + sum6 = sum2; + sum7 = sum3; + sum8 = sum4; + } + + for(int j=0; j<(num_col_im2col >> 4); j++) + { + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + sum = MacLoad8(0, 0, 0, 0, ptrA, sum); + sum2 = MacLoad8(0, 0, 1, 0, ptrA2, sum2); + sum3 = MacLoad8(0, 0, 2, 0, ptrA3, sum3); + sum4 = MacLoad8(0, 1, 3, 0, ptrB, sum4); + ptrB = MacLoadUpdate(ptrB); + + + sum5 = MacLoad8(1, 0, 0, 1, ptrA, sum5); + ptrA = MacLoadUpdate(ptrA); + + sum6 = MacLoad8(1, 0, 1, 1, ptrA2, sum6); + ptrA2 = MacLoadUpdate(ptrA2); + + sum7 = MacLoad8(1, 0, 2, 1, ptrA3, sum7); + ptrA3 = MacLoadUpdate(ptrA3); + + sum8 = MacLoad8(1, 0, 3, 1, ptrA4, sum8); + ptrA4 = MacLoadUpdate(ptrA4); + + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + sum = MacLoad8(0, 0, 0, 0, ptrA, sum); + sum2 = MacLoad8(0, 0, 1, 0, ptrA2, sum2); + sum3 = MacLoad8(0, 0, 2, 0, ptrA3, sum3); + sum4 = MacLoad8(0, 1, 3, 0, ptrB, sum4); + ptrB = MacLoadUpdate(ptrB); + + pA = pulp_nn_i2_to_i4(pA , vecA); + pA2 = pulp_nn_i2_to_i4(pA2, vecA2); + pA3 = pulp_nn_i2_to_i4(pA3, vecA3); + pA4 = pulp_nn_i2_to_i4(pA4, vecA4); + + ptrA = MacLoadAssign(vecA); + ptrA2 = MacLoadAssign(vecA2); + ptrA3 = MacLoadAssign(vecA3); + ptrA4 = MacLoadAssign(vecA4); + + sum5 = 
MacLoad8(1, 0, 0, 1, ptrA, sum5); + ptrA = MacLoadUpdate(ptrA); + + sum6 = MacLoad8(1, 0, 1, 1, ptrA2, sum6); + ptrA2 = MacLoadUpdate(ptrA2); + + sum7 = MacLoad8(1, 0, 2, 1, ptrA3, sum7); + ptrA3 = MacLoadUpdate(ptrA3); + + sum8 = MacLoad8(1, 0, 3, 1, ptrA4, sum8); + ptrA4 = MacLoadUpdate(ptrA4); + } + pA-=4; + pA2-=4; + pA3-=4; + pA4-=4; + + int col_cnt_im2col = num_col_im2col & 0xf; + + if(col_cnt_im2col) + { + + uint16_t loop_cnt_im2col_a = (num_col_im2col >> 4) << 3; + pB+=loop_cnt_im2col_a; + pB2+=loop_cnt_im2col_a; + + do + { + int8_t inA = (int8_t) bitext((int) *pA, 2, 0); + int8_t inA2 = (int8_t) bitext((int) *pA2, 2, 0); + int8_t inA3 = (int8_t) bitext((int) *pA3, 2, 0); + int8_t inA4 = (int8_t) bitext((int) *pA4, 2, 0); + + uint8_t inB = (uint8_t)bitextu((uint32_t) *pB, 4, 0); + uint8_t inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 0); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 2); + inA2 = (int8_t) bitext((int) *pA2, 2, 2); + inA3 = (int8_t) bitext((int) *pA3, 2, 2); + inA4 = (int8_t) bitext((int) *pA4, 2, 2); + + inB = (uint8_t)bitextu((uint32_t) *pB, 4, 4); + inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 4); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + pB++; + pB2++; + + inA = (int8_t) bitext((int) *pA, 2, 4); + inA2 = (int8_t) bitext((int) *pA2, 2, 4); + inA3 = (int8_t) bitext((int) *pA3, 2, 4); + inA4 = (int8_t) bitext((int) *pA4, 2, 4); + + inB = (uint8_t)bitextu((uint32_t) *pB, 4, 0); + inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 0); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + inA = (int8_t) 
bitext((int) *pA, 2, 6); + inA2 = (int8_t) bitext((int) *pA2, 2, 6); + inA3 = (int8_t) bitext((int) *pA3, 2, 6); + inA4 = (int8_t) bitext((int) *pA4, 2, 6); + + inB = (uint8_t)bitextu((uint32_t) *pB, 4, 4); + inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 4); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + pA++; + pA2++; + pA3++; + pA4++; + + pB++; + pB2++; + + col_cnt_im2col-=4; + } while(col_cnt_im2col > 0); + } + if (flag_batch_norm && flag_relu) + { + *pOut = pulp_nn_bn_quant_u8(sum, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum5, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + + *pOut = pulp_nn_bn_quant_u8(sum2, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum6, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + + *pOut = pulp_nn_bn_quant_u8(sum3, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum7, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + + *pOut = pulp_nn_bn_quant_u8(sum4, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum8, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + } + else + { + if (flag_relu == 1) + { + *pOut = pulp_nn_quant_u8(sum, out_mult, out_shift); + pOut++; + *pOut = pulp_nn_quant_u8(sum2, out_mult, out_shift); + pOut++; + *pOut = pulp_nn_quant_u8(sum3, out_mult, out_shift); + pOut++; + *pOut = pulp_nn_quant_u8(sum4, out_mult, out_shift); + pOut++; + + *pOut2 = pulp_nn_quant_u8(sum5, out_mult, out_shift); + pOut2++; + *pOut2 = pulp_nn_quant_u8(sum6, out_mult, out_shift); + pOut2++; + *pOut2 = pulp_nn_quant_u8(sum7, out_mult, out_shift); + pOut2++; + *pOut2 = pulp_nn_quant_u8(sum8, out_mult, out_shift); + pOut2++; + + } + else + { + *pOut = (uint8_t) clip8(sum >> out_shift); + pOut++; + *pOut = (uint8_t) clip8(sum2 >> 
out_shift); + pOut++; + *pOut = (uint8_t) clip8(sum3 >> out_shift); + pOut++; + *pOut = (uint8_t) clip8(sum4 >> out_shift); + pOut++; + + *pOut2 = (uint8_t) clip8(sum5 >> out_shift); + pOut2++; + *pOut2 = (uint8_t) clip8(sum6 >> out_shift); + pOut2++; + *pOut2 = (uint8_t) clip8(sum7 >> out_shift); + pOut2++; + *pOut2 = (uint8_t) clip8(sum8 >> out_shift); + pOut2++; + + } + } + pA+=(3 * num_col_im2col_w); + } + while(chan_left) + { + uint8_t *pB = pIn; + uint8_t *pB2 = (pB + num_col_im2col_a); + + uint32_t *ptrB = (uint32_t *) pB; + uint32_t *ptrB2 = (uint32_t *) pB2; + + pA = pulp_nn_i2_to_i4(pA , vecA); + + int32_t *startA; + + asm volatile("mv %0, %1":"=r"(startA):"r"(vecA)); + + int32_t *ptrA = (int32_t *) vecA; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + if (pBias != NULL) + { + sum = *((int*) pBias++); + } + int sum2 = sum; + + for(int j=0; j < (num_col_im2col >> 4); j++) + { + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + sum = MacLoad8(0, 1, 0, 0, ptrB, sum); + ptrB = MacLoadUpdate(ptrB); + + sum2 = MacLoad8(1, 0, 0, 1, ptrA, sum2); + ptrA = MacLoadUpdate(ptrA); + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + sum = MacLoad8(0, 1, 0, 0, ptrB, sum); + ptrB = MacLoadUpdate(ptrB); + + pA = pulp_nn_i2_to_i4(pA , vecA); + + ptrA = MacLoadAssign(vecA); + + sum2 = MacLoad8(1, 0, 0, 1, ptrA, sum2); + ptrA = MacLoadUpdate(ptrA); + } + pA-=4; + int col_cnt_im2col = num_col_im2col & 0xf; + + if(col_cnt_im2col) + { + + uint16_t loop_cnt_im2col_a = (num_col_im2col >> 4) << 3; + pB+=loop_cnt_im2col_a; + pB2+=loop_cnt_im2col_a; + + do + { + int8_t inA = (int8_t) bitext((int) *pA, 2, 0); + + uint8_t inB = (uint8_t)bitextu((uint32_t) *pB, 4, 0); + uint8_t inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 0); + + sum += inA * inB; + + sum2 += inA * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 2); + + inB = (uint8_t)bitextu((uint32_t) *pB, 4, 4); + inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 4); + + sum += inA * inB; + + sum2 += 
inA * inB2; + + pB++; + pB2++; + + inA = (int8_t) bitext((int) *pA, 2, 4); + + inB = (uint8_t)bitextu((uint32_t) *pB, 4, 0); + inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 0); + + sum += inA * inB; + + sum2 += inA * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 6); + + inB = (uint8_t)bitextu((uint32_t) *pB, 4, 4); + inB2 = (uint8_t)bitextu((uint32_t) *pB2, 4, 4); + + sum += inA * inB; + + sum2 += inA * inB2; + + pA++; + + pB++; + pB2++; + + col_cnt_im2col-=4; + } while(col_cnt_im2col > 0); + } + if (flag_batch_norm && flag_relu) + { + *pOut = pulp_nn_bn_quant_u8(sum, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum2, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + } + else + { + if (flag_relu == 1) + { + *pOut = pulp_nn_quant_u8(sum, out_mult, out_shift); + pOut++; + *pOut2 = pulp_nn_quant_u8(sum2, out_mult, out_shift); + pOut2++; + } + else + { + *pOut = (uint8_t) clip8(sum >> out_shift); + pOut++; + *pOut2 = (uint8_t) clip8(sum2 >> out_shift); + pOut2++; + } + } + chan_left--; + } + pOut+=ch_out_r; + return pOut; +} diff --git a/rt_nn_tests/xpnn_maxpool_2b/Makefile b/rt_nn_tests/xpnn_maxpool_2b/Makefile new file mode 100644 index 0000000..ed30544 --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/Makefile @@ -0,0 +1,18 @@ +APP = test +PULP_APP = test +PULP_APP_SRCS = test.c +PULP_APP_SRCS += xpulp_nn_maxpool_i2.c + +CORE=8 + + +PULP_CFLAGS += -DNUM_CORES=$(CORE) -I. -O3 +PULP_LDFLAGS += -lc -lm -lgcc -Wl,-print-memory-usage + +PULP_APP_CFLAGS += -DNUM_CORES=$(CORE) -I. 
-O3 +PULP_APP_LDFLAGS += -lc -lm -lgcc -Wl,-print-memory-usage + +PULP_CFLAGS += -DPULP_RUNTIME -DSINGLE_CORE_DMA -DALWAYS_BLOCK_DMA_TRANSFERS + + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/rt_nn_tests/xpnn_maxpool_2b/data_statstest.h b/rt_nn_tests/xpnn_maxpool_2b/data_statstest.h new file mode 100644 index 0000000..91291ed --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/data_statstest.h @@ -0,0 +1,10270 @@ +// this file is generated automatically by the python script `generate_statstest.py` + +#ifndef _DATA_H +#define _DATA_H + + + +#define IM2COL_DIM (32 * NUM_CORES * 2) // for NUM_CORES cores +#define OUTPUT_DIM 2048 +#define OUTPUT_DIM_FP (1) + +uint8_t pIn_0 [] = { +0xb7, +0xd3, +0x41, +0xbd, +0xa7, +0x84, +0xc9, +0x6c, +0xb7, +0x85, +0xef, +0xc3, +0xe6, +0xd1, +0x57, +0xda, +0x0a, +0x5a, +0x40, +0xcd, +0x0b, +0xd9, +0xf0, +0xbd, +0x01, +0x9c, +0x1a, +0x06, +0x55, +0xf8, +0xee, +0xcf, +0x62, +0x05, +0xdc, +0x27, +0xac, +0x99, +0xc0, +0xc4, +0xc3, +0x9a, +0x69, +0x04, +0x51, +0xf6, +0xf3, +0x08, +0x91, +0x8e, +0xd7, +0xae, +0x9d, +0xa6, +0x43, +0x4e, +0x72, +0xfb, +0xdd, +0x59, +0x39, +0x37, +0x69, +0x9b, +0x57, +0x94, +0xd7, +0x7b, +0x42, +0x0a, +0x9e, +0x3f, +0xe7, +0xeb, +0x54, +0x43, +0xc8, +0x0a, +0xa5, +0x03, +0x9e, +0x11, +0x6b, +0x3a, +0x7e, +0x6f, +0x6c, +0x18, +0xe7, +0x87, +0x7d, +0x6d, +0x42, +0x7b, +0x84, +0x90, +0x13, +0x38, +0xbc, +0xbb, +0x73, +0x80, +0x7a, +0x66, +0x38, +0x73, +0x2a, +0xed, +0x6f, +0x17, +0x86, +0x41, +0x16, +0x60, +0xe3, +0x2e, +0x41, +0x88, +0x91, +0x21, +0x4b, +0x9e, +0x5c, +0xdf, +0x7d, +0xbd, +0x87, +0xcb, +0xae, +0x7f, +0xa0, +0xf4, +0x09, +0xad, +0xbd, +0xb1, +0x48, +0xd9, +0x96, +0xfc, +0x9a, +0x92, +0xaf, +0x92, +0x0a, +0x49, +0x0c, +0x40, +0xf0, +0xa7, +0xb2, +0xbd, +0x1c, +0x91, +0x5e, +0x47, +0xcc, +0x4e, +0x9a, +0xc9, +0x16, +0x50, +0x50, +0xb9, +0xb6, +0x6f, +0xea, +0x1b, +0x75, +0xb7, +0x5d, +0x5a, +0xec, +0x47, +0x9a, +0x89, +0x36, +0xb2, +0x2e, +0x97, +0x4c, +0xd8, +0x3e, +0xb7, +0xfa, +0xaa, 
+0x4f, +0x2e, +0xe0, +0x68, +0x1d, +0xd7, +0xc9, +0xa3, +0x65, +0xa4, +0xff, +0x0a, +0xa3, +0x1f, +0xc1, +0x92, +0xf1, +0x17, +0x70, +0x55, +0xb7, +0x9c, +0x7c, +0xb2, +0x11, +0x1c, +0xc5, +0x0a, +0xfd, +0x6e, +0xfe, +0x5c, +0x28, +0xd9, +0x90, +0xf9, +0xbc, +0x5d, +0xf5, +0xf9, +0x9f, +0x47, +0x52, +0x49, +0x79, +0xcb, +0x8c, +0xec, +0x2c, +0xe9, +0x84, +0xe3, +0x66, +0x50, +0x18, +0xb7, +0x8a, +0x57, +0xd5, +0x8f, +0xb4, +0xc9, +0x37, +0xe2, +0x15, +0x49, +0x4d, +0xac, +0x05, +0x6c, +0xbe, +0x1a, +0x50, +0xf8, +0xa8, +0x94, +0x60, +0x3d, +0xfa, +0x90, +0x6d, +0x5b, +0x27, +0x22, +0x77, +0xa5, +0x20, +0x9b, +0xd2, +0xf1, +0x5c, +0x03, +0x8c, +0x04, +0x53, +0xcf, +0x1c, +0x6a, +0x0d, +0xfd, +0xba, +0x8a, +0x55, +0x95, +0x25, +0x65, +0x2c, +0x4d, +0x4b, +0x77, +0x6c, +0xc6, +0x46, +0xf1, +0x57, +0x2c, +0x1d, +0x41, +0x8b, +0xf4, +0x08, +0x62, +0x26, +0xd6, +0x14, +0x65, +0xe3, +0x8c, +0x63, +0x34, +0x7c, +0x64, +0xb1, +0x0e, +0xfb, +0x1b, +0x6c, +0x9c, +0x19, +0xff, +0xcd, +0xfc, +0x7d, +0xf3, +0x8c, +0x4b, +0x61, +0xb8, +0xeb, +0xd9, +0x6b, +0x5b, +0x79, +0xb4, +0x3e, +0x10, +0x4c, +0x1b, +0x98, +0xd2, +0x1a, +0x62, +0xf7, +0x30, +0xea, +0xe6, +0xcf, +0x1c, +0x46, +0xba, +0x57, +0x1e, +0x4e, +0x73, +0x02, +0xbb, +0x5b, +0x27, +0x1b, +0xe8, +0xe8, +0x7b, +0x20, +0x95, +0x70, +0xa9, +0x87, +0x0a, +0x36, +0x83, +0xc8, +0x1a, +0x03, +0x8e, +0xd9, +0x10, +0x9c, +0x04, +0x14, +0xe2, +0xed, +0x31, +0x43, +0xd2, +0x76, +0x75, +0x5f, +0x30, +0xc5, +0x4c, +0x7b, +0x86, +0xdd, +0x35, +0x42, +0xbe, +0x90, +0xad, +0xb6, +0x3a, +0x1c, +0x7f, +0x67, +0xfd, +0x9f, +0xa3, +0xd7, +0xf3, +0xdf, +0xbb, +0x8d, +0x16, +0x20, +0x51, +0xa0, +0xe7, +0x9b, +0x68, +0x0f, +0x75, +0xfd, +0xbc, +0xa3, +0x58, +0x98, +0x7b, +0xaa, +0xb2, +0x8b, +0xba, +0x6f, +0xe3, +0x36, +0xd8, +0x89, +0xbb, +0xbf, +0xf1, +0x97, +0x5c, +0x9b, +0xa4, +0xb8, +0x83, +0xc0, +0x39, +0x2a, +0x78, +0xba, +0xbf, +0x87, +0x92, +0xcd, +0xe8, +0x96, +0x56, +0xa9, +0x18, +0x57, +0xc0, +0xf3, +0x19, +0x0b, +0x70, +0x84, 
+0xa7, +0x60, +0x18, +0x64, +0x55, +0x44, +0xc8, +0xcb, +0xa7, +0x19, +0xb4, +0x80, +0x28, +0x68, +0xcb, +0xfb, +0x05, +0xbc, +0xce, +0x90, +0x58, +0xdb, +0x1b, +0x2e, +0xd4, +0x7a, +0xd2, +0x73, +0x64, +0x4e, +0x5a, +0x88, +0x55, +0xc1, +0x27, +0x23, +0x77, +0xc8, +0xbb, +0x3f, +0x1b, +0x06, +0xbf, +0x30, +0x41, +0xbb, +0xcc, +0xa7, +0xda, +0x58, +0x20, +0xe5, +0x85, +0xc6, +0x08, +0x16, +0x4d, +0x2a, +0x1e, +0x0b, +0x7a, +0x99, +0xf9, +0xc7, +0x31, +0x2f, +0xc5, +0xea, +0xc1, +0xb1, +0xb2, +0x62, +0x90, +0xcd, +0xa3, +0x1f, +0x10, +0xb4, +0xe8, +0x18, +0x4d, +0xa8, +0x6e, +0x3c, +0xf2, +0xbc, +0x8d, +0xcd, +0xcb, +0xce, +0x6d, +0x31, +0x13, +0xad, +0x38, +0xb1, +0x7a, +0xc8, +0x53, +0x86, +0x5c, +0x85, +0x4e, +0x85, +0x91, +0x19, +0xc0, +0x2f, +0xcd, +0xdb, +0x3a, +0x87, +0xfa, +0x72, +0x70, +0x2b, +0xdb, +0x5f, +0xdc, +0x50, +0xf5, +0xf9, +0x77, +0x01, +0xb1, +0x67, +0x60, +0xd5, +0xdc, +0x8f, +0x6c, +0xaf, +0x16, +0xb7, +0xdb, +0x05, +0xc3, +0x54, +0x3e, +0x04, +0xa6, +0xe2, +0x6b, +0xb4, +0x65, +0x6d, +0x29, +0xf3, +0x2d, +0x91, +0xa7, +0xec, +0xa5, +0x25, +0x27, +0x5f, +0xd3, +0x75, +0x32, +0x70, +0x59, +0xbe, +0x04, +0x15, +0xcd, +0xf8, +0xc9, +0x98, +0xfa, +0x5f, +0x01, +0xe7, +0x7c, +0x4b, +0xe0, +0xfa, +0x97, +0xea, +0xea, +0xbb, +0x7d, +0xbb, +0x5d, +0x72, +0x93, +0xac, +0xb6, +0xac, +0x90, +0xdd, +0x42, +0xd1, +0xb6, +0xf6, +0x4a, +0x0e, +0x70, +0x36, +0x31, +0x10, +0x3f, +0x5a, +0xf9, +0xa8, +0xfd, +0x57, +0x84, +0x05, +0x16, +0x1a, +0x3d, +0x10, +0x58, +0x76, +0x49, +0xb7, +0xf0, +0x23, +0xdb, +0x71, +0xfb, +0x4a, +0xa5, +0x27, +0xec, +0xde, +0x65, +0xa5, +0x51, +0xa1, +0x07, +0xab, +0x10, +0x8d, +0x6a, +0x80, +0x61, +0x27, +0x22, +0xc0, +0x61, +0x4e, +0x51, +0x4f, +0x21, +0x44, +0x8a, +0xdb, +0x26, +0xee, +0x78, +0xed, +0xa2, +0xdd, +0x59, +0xc0, +0x7a, +0xa0, +0x09, +0x7d, +0x83, +0x4b, +0x2b, +0x09, +0x80, +0x9a, +0x2e, +0x4a, +0x9b, +0x28, +0x53, +0x79, +0xc4, +0xf0, +0x6e, +0x0e, +0xff, +0x2e, +0x15, +0xf8, +0xa0, +0xec, +0xb0, +0xd3, +0xd3, 
+0x16, +0x5f, +0xdd, +0x80, +0xfe, +0xd3, +0x2b, +0xb7, +0x8e, +0x53, +0x0b, +0x20, +0x1d, +0xeb, +0x52, +0x05, +0xb3, +0x7f, +0x0a, +0xf1, +0x2f, +0x11, +0x86, +0xb2, +0xf2, +0xfc, +0x42, +0x87, +0xd2, +0x4f, +0x50, +0x5f, +0x9e, +0x73, +0x15, +0xaf, +0xc5, +0x94, +0x14, +0xfa, +0x65, +0x12, +0xf0, +0x6a, +0x43, +0xa8, +0x66, +0x11, +0x8d, +0xf9, +0xde, +0x96, +0xac, +0x72, +0x9d, +0x16, +0x7a, +0x3a, +0x2a, +0xdd, +0x7c, +0x3c, +0xfe, +0x11, +0x35, +0xe7, +0xfd, +0x8b, +0xeb, +0xe1, +0x33, +0x92, +0x59, +0xb9, +0xff, +0x52, +0x1c, +0x41, +0x95, +0x11, +0xcb, +0x5f, +0xc9, +0x90, +0xf0, +0x50, +0x48, +0xa6, +0x99, +0x53, +0x3d, +0x7a, +0x79, +0x78, +0xd9, +0x65, +0x40, +0x86, +0x3a, +0x3b, +0x32, +0xcb, +0xb7, +0x32, +0x21, +0x7c, +0x4d, +0x80, +0xba, +0xf6, +0x29, +0x13, +0x2d, +0xb0, +0xae, +0xdf, +0x32, +0x5c, +0x9f, +0x47, +0xc1, +0xd1, +0xb9, +0xf3, +0xda, +0x67, +0x7d, +0x21, +0x02, +0xf0, +0x1e, +0x7e, +0xc0, +0x9d, +0xd4, +0x68, +0x4e, +0x27, +0x01, +0xe6, +0x1c, +0xe5, +0xa8, +0x30, +0xd8, +0x0d, +0x87, +0x5d, +0xb7, +0x36, +0x4e, +0x29, +0xae, +0x14, +0xe4, +0xab, +0x37, +0x81, +0xdb, +0x2a, +0x76, +0xb8, +0x75, +0x93, +0x39, +0x84, +0x2b, +0xb0, +0xd6, +0xa7, +0x1f, +0x31, +0x2e, +0xda, +0x09, +0xf4, +0xc8, +0xc5, +0x64, +0x10, +0xaa, +0xf4, +0xcf, +0x74, +0x3c, +0x98, +0xc8, +0x48, +0xb8, +0xea, +0xd2, +0x47, +0x1e, +0x96, +0xce, +0x71, +0x47, +0x44, +0x8f, +0xde, +0x7a, +0x01, +0x8d, +0x3d, +0x37, +0x1f, +0xd8, +0xc1, +0x21, +0xe3, +0xa2, +0x3d, +0x51, +0x82, +0x5e, +0x13, +0x4b, +0x08, +0x76, +0x6c, +0x17, +0x67, +0x9a, +0xfb, +0x21, +0x46, +0x97, +0xe0, +0xd8, +0xf3, +0x1a, +0x74, +0x5e, +0x48, +0xab, +0x33, +0xc7, +0x05, +0xa6, +0x9b, +0x42, +0x88, +0x31, +0xed, +0xf6, +0x7c, +0xe9, +0x07, +0x05, +0xea, +0xff, +0x0a, +0x43, +0x3f, +0xe2, +0x98, +0x30, +0x61, +0x17, +0xe9, +0x2f, +0x31, +0xae, +0x87, +0x0d, +0xe1, +0x66, +0x2f, +0x57, +0xe0, +0xe4, +0x34, +0x1b, +0xc7, +0x92, +0x4b, +0xee, +0x1a, +0x69, +0x73, +0x14, +0x4d, +0x51, +0x07, +0x04, 
+0xe5, +0x97, +0x91, +0xea, +0xa9, +0x7b, +0x06, +0xb9, +0x54, +0xbe, +0x2f, +0x8d, +0x6e, +0x4f, +0xca, +0x8d, +0x17, +0xb8, +0x18, +0x2e, +0xa5, +0x31, +0x9f, +0x9b, +0x28, +0x21, +0x5f, +0xcc, +0x95, +0xe2, +0x79, +0x72, +0x0c, +0xcb, +0x6a, +0x5f, +0xa2, +0x28, +0x8e, +0x8e, +0x85, +0x2d, +0xcc, +0x59, +0x0e, +0xaf, +0x00, +0x34, +0xad, +0xda, +0x9a, +0x0a, +0xda, +0x44, +0x07, +0x1e, +0x4b, +0x7c, +0x6e, +0xb7, +0xa9, +0x72, +0x66, +0x88, +0xc7, +0xe9, +0xd6, +0x72, +0x32, +0xc8, +0xb2, +0xe1, +0x9b, +0xa0, +0x39, +0x25, +0xe0, +0xd6, +0xf6, +0x85, +0xda, +0x5f, +0x8d, +0x13, +0xc9, +0x26, +0x1c, +0x21, +0x0c, +0x78, +0x72, +0x56, +0xb5, +0x7a, +0x0c, +0xb3, +0xd4, +0x04, +0xc0, +0x58, +0x63, +0x89, +0xfd, +0x2c, +0x02, +0x98, +0x91, +0x21, +0x80, +0xb7, +0xf6, +0x3b, +0x5e, +0x3d, +0x1e, +0xa5, +0x9d, +0xcc, +0xeb, +0x0f, +0x31, +0xf4, +0x3b, +0xbb, +0x06, +0x9b, +0x4d, +0x5c, +0xa7, +0xf5, +0xfb, +0x85, +0xca, +0xaf, +0xe4, +0x9e, +0x4e, +0x5e, +0x31, +0x53, +0x3d, +0x6e, +0x94, +0x4c, +0x5c, +0x3d, +0x7b, +0x92, +0xd6, +0x6b, +0x2f, +0x5b, +0x26, +0x2f, +0xe7, +0xc1, +0x79, +0xde, +0xe5, +0x4a, +0x73, +0xcb, +0xeb, +0x7d, +0x56, +0x9a, +0x49, +0x33, +0xcd, +0xb2, +0xc8, +0xa2, +0x4b, +0x47, +0x41, +0x6b, +0x6b, +0xb5, +0x66, +0x78, +0x05, +0xf4, +0xc9, +0xc2, +0x9e, +0x77, +0x69, +0x5f, +0x39, +0x50, +0xeb, +0x77, +0x6e, +0xe9, +0xb9, +0xd3, +0x22, +0xe2, +0xa9, +0x30, +0x03, +0xdb, +0xb2, +0xea, +0xae, +0x10, +0x8b, +0x46, +0x6b, +0x9c, +0xf8, +0xea, +0x88, +0x47, +0x62, +0x78, +0x89, +0x4c, +0x5e, +0x5e, +0xff, +0x7a, +0x4e, +0x24, +0x33, +0x0e, +0x4d, +0x43, +0x02, +0x68, +0x92, +0x1b, +0x6d, +0xe6, +0xe5, +0x94, +0xeb, +0x40, +0x8c, +0x77, +0x4c, +0x50, +0x36, +0x26, +0x9d, +0x9f, +0x6e, +0xea, +0x4c, +0xf5, +0xeb, +0xf1, +0x6c, +0x3c, +0x06, +0x9e, +0xea, +0x17, +0xcd, +0x48, +0x8f, +0x29, +0xec, +0x9a, +0x11, +0xb2, +0x47, +0xff, +0x75, +0x23, +0x22, +0x15, +0x2f, +0x5c, +0xa1, +0x7a, +0x8f, +0x20, +0x4b, +0x2f, +0xc6, +0x2c, +0x3b, +0x5d, +0x67, 
+0xbd, +0x67, +0xcf, +0x66, +0x87, +0x2a, +0x9a, +0x49, +0xcc, +0x49, +0x66, +0xd1, +0x17, +0xb1, +0x65, +0x0d, +0xe6, +0x14, +0x9a, +0x57, +0x84, +0x5a, +0x55, +0x00, +0x70, +0x32, +0x56, +0x09, +0xa7, +0x2a, +0xfd, +0x94, +0xb6, +0x81, +0x8c, +0x38, +0x46, +0x0b, +0x7f, +0x52, +0x53, +0x2a, +0x90, +0xd1, +0x1c, +0xd8, +0x35, +0xb6, +0xea, +0xec, +0xdc, +0xae, +0x16, +0xa6, +0x76, +0x29, +0x2d, +0xc7, +0x06, +0xb5, +0x10, +0x97, +0x28, +0x3c, +0xdd, +0x0b, +0x94, +0x08, +0x97, +0x82, +0xa1, +0x1b, +0x66, +0x86, +0x51, +0x03, +0x89, +0x1e, +0x2d, +0x5a, +0xb6, +0x56, +0xd8, +0xc9, +0x26, +0xc0, +0x51, +0x6a, +0xb6, +0x7c, +0x99, +0x32, +0xd3, +0x8c, +0xe8, +0x76, +0xb4, +0x87, +0x65, +0x99, +0x60, +0xfa, +0x6c, +0x17, +0xa5, +0x47, +0x31, +0x25, +0x0b, +0x25, +0x85, +0xc2, +0xb4, +0xb2, +0x38, +0x72, +0x4b, +0xe6, +0x59, +0x40, +0xf5, +0x70, +0x5c, +0x50, +0x86, +0xa4, +0xe1, +0x77, +0x3a, +0xe3, +0x4c, +0x7a, +0xc2, +0x9d, +0x08, +0x0c, +0x7c, +0xcb, +0x3b, +0xf1, +0xd0, +0x5f, +0x1a, +0xde, +0x31, +0x2b, +0xc0, +0xd2, +0x3f, +0xff, +0xe2, +0xbe, +0x3e, +0x07, +0xa5, +0x47, +0xe2, +0x1f, +0x2e, +0xcf, +0x87, +0xa3, +0xe2, +0x59, +0x71, +0x2c, +0x3a, +0xb4, +0xd0, +0x78, +0xff, +0xb1, +0xe3, +0x76, +0xa2, +0xf2, +0x05, +0xb2, +0xd7, +0x1f, +0x4b, +0x6c, +0x93, +0xe7, +0xf6, +0x29, +0x53, +0xec, +0x7f, +0xb9, +0x12, +0x30, +0x88, +0x01, +0xb5, +0xe9, +0x5b, +0x90, +0x37, +0x31, +0x3d, +0xba, +0x5f, +0x17, +0x20, +0x65, +0x84, +0xbf, +0x3b, +0x89, +0x0d, +0x28, +0xcb, +0x6e, +0x9f, +0x51, +0xb3, +0x58, +0xb0, +0x6f, +0xc3, +0x34, +0x03, +0x1f, +0x58, +0xf0, +0x7f, +0xe8, +0x13, +0x47, +0x7f, +0x55, +0x80, +0xeb, +0x55, +0x2f, +0x7e, +0xd1, +0x61, +0x35, +0x6c, +0x95, +0x87, +0x27, +0x7c, +0x83, +0x13, +0x6b, +0x3b, +0x81, +0xdf, +0x89, +0xcb, +0xa5, +0x22, +0xe5, +0x7a, +0x23, +0x35, +0x41, +0x26, +0x90, +0x6b, +0x62, +0x90, +0x2f, +0x21, +0x68, +0x5c, +0x2f, +0x0a, +0xb3, +0xaa, +0xed, +0x22, +0x75, +0x12, +0xff, +0x0e, +0xbf, +0xf7, +0x71, +0xca, +0xee, +0xa5, 
+0x56, +0x82, +0x65, +0x7c, +0xe2, +0x53, +0x53, +0x86, +0x9c, +0x36, +0x72, +0x13, +0x7b, +0x43, +0x64, +0xa5, +0x5a, +0x96, +0x30, +0xb7, +0x4f, +0xea, +0xcd, +0x23, +0x6a, +0x37, +0x35, +0x1d, +0x42, +0x0f, +0x8f, +0x1c, +0x52, +0xad, +0xa7, +0xa7, +0xfc, +0xc5, +0xa0, +0x21, +0xcd, +0x0d, +0x13, +0x4f, +0x9d, +0xad, +0xe9, +0x03, +0x5c, +0x36, +0x65, +0xff, +0xb1, +0xb8, +0xc5, +0xd4, +0x4d, +0x36, +0x47, +0x90, +0x65, +0xd1, +0xca, +0xf7, +0x60, +0x10, +0xdc, +0xf7, +0x32, +0xd2, +0xb9, +0xc6, +0x28, +0x63, +0x4a, +0x47, +0xf9, +0xc7, +0xd6, +0x2d, +0x29, +0xb1, +0x3c, +0xf4, +0x91, +0x62, +0xec, +0x17, +0x1c, +0x2c, +0xc3, +0xb9, +0x7e, +0x96, +0x49, +0xb0, +0x39, +0xc9, +0xd4, +0x2c, +0x8c, +0xc1, +0xe1, +0x60, +0x33, +0x97, +0x28, +0x26, +0x20, +0x57, +0x65, +0xb8, +0xf4, +0x7b, +0xcf, +0xa0, +0x29, +0x54, +0x9b, +0x82, +0x75, +0x35, +0x61, +0x15, +0xb0, +0xbd, +0x9c, +0xdd, +0x8d, +0x8c, +0x41, +0x45, +0x86, +0xdb, +0xa4, +0xdf, +0xbb, +0xf5, +0x52, +0x89, +0x23, +0xb7, +0x10, +0xba, +0x17, +0xf6, +0x56, +0x08, +0x2c, +0xac, +0x64, +0x32, +0x48, +0x16, +0x2e, +0x47, +0xce, +0x07, +0x7e, +0x42, +0x48, +0x0e, +0x24, +0x50, +0x49, +0xe3, +0xc9, +0xb5, +0x90, +0x66, +0x0b, +0xf2, +0x01, +0xe8, +0x9b, +0x5f, +0x7d, +0xa0, +0x20, +0x14, +0x4e, +0x47, +0xbc, +0x82, +0xc3, +0xce, +0xb3, +0x60, +0xdc, +0xef, +0x42, +0xed, +0x02, +0x68, +0x64, +0x01, +0x7d, +0xbf, +0xef, +0x1b, +0xc3, +0xdd, +0xe4, +0xfa, +0xa4, +0x02, +0x3e, +0x7c, +0xf6, +0x60, +0x94, +0xd2, +0xa0, +0xbd, +0x58, +0xda, +0xff, +0xe7, +0x2a, +0x51, +0x6e, +0x30, +0x7a, +0x09, +0xac, +0x55, +0xce, +0xb5, +0xe0, +0x7a, +0x39, +0x2f, +0x53, +0x56, +0x25, +0xa2, +0x7c, +0x19, +0xec, +0x07, +0x07, +0xf6, +0xbb, +0xc3, +0xb3, +0xe1, +0x3a, +0x35, +0x9c, +0x45, +0xc0, +0x24, +0x71, +0x39, +0x60, +0xf8, +0x96, +0xa5, +0xe7, +0x5c, +0x6a, +0x73, +0x62, +0xf8, +0xda, +0x20, +0xef, +0x08, +0xe6, +0xa9, +0x8d, +0xb7, +0x59, +0x45, +0x46, +0xc1, +0x66, +0x18, +0x6a, +0x25, +0x62, +0xdd, +0x11, +0xa1, +0xf1, 
+0x0e, +0xdd, +0x4e, +0x2b, +0x43, +0x52, +0x0b, +0xd7, +0xe5, +0x67, +0xce, +0x0b, +0xdf, +0x08, +0x84, +0x23, +0x73, +0x99, +0xf3, +0x77, +0x3a, +0x90, +0x24, +0x53, +0xee, +0xf9, +0x0c, +0x6f, +0xf5, +0x0a, +0x6f, +0xa5, +0xc2, +0xda, +0x3a, +0x78, +0x23, +0x32, +0x4d, +0x9b, +0x8e, +0x3c, +0x4c, +0x81, +0x69, +0x7f, +0x96, +0xcc, +0xe0, +0xe2, +0x05, +0xfc, +0xec, +0x76, +0x7f, +0x85, +0x0c, +0xec, +0x32, +0x73, +0xb9, +0x01, +0xa8, +0x1c, +0x00, +0x69, +0x0e, +0x0e, +0xe0, +0xd4, +0x49, +0x05, +0xc6, +0xb9, +0x78, +0x6c, +0x94, +0x3c, +0x7a, +0xab, +0x32, +0x42, +0x07, +0x31, +0xce, +0xc9, +0xcd, +0x0d, +0xc4, +0xce, +0x36, +0x02, +0x96, +0xd5, +0x51, +0x58, +0x64, +0xd5, +0x01, +0xe0, +0x72, +0xcf, +0x81, +0xf0, +0xc3, +0xd4, +0x75, +0x96, +0x7a, +0x4d, +0x6a, +0x6a, +0xa0, +0x40, +0xf2, +0xce, +0x1e, +0x01, +0xfb, +0xa5, +0x97, +0xfa, +0x3c, +0xdd, +0x7c, +0xac, +0xf3, +0x5c, +0xf4, +0xe8, +0xcc, +0x8e, +0x4d, +0xb4, +0xc7, +0xd6, +0x47, +0xc5, +0x4d, +0x24, +0x44, +0x6a, +0x43, +0x33, +0x38, +0x84, +0x69, +0xbc, +0x1d, +0x62, +0xa0, +0x98, +0xa5, +0x82, +0xfc, +0x5a, +0x71, +0x53, +0xbe, +0x78, +0xa0, +0x09, +0xc6, +0xc2, +0x10, +0x30, +0xd3, +0xea, +0xcb, +0x1f, +0x61, +0x8c, +0xd3, +0xd9, +0x28, +0x5b, +0xf5, +0x1a, +0x9d, +0xaf, +0x42, +0xbf, +0xbe, +0xca, +0x2e, +0x88, +0xe4, +0x8f, +0xcd, +0x16, +0x34, +0xc5, +0x77, +0x50, +0x37, +0xd1, +0xe9, +0x2a, +0xbf, +0xd1, +0x77, +0x56, +0xed, +0x74, +0xa3, +0x1c, +0x26, +0x08, +0x4d, +0x36, +0xf3, +0x79, +0xd5, +0x13, +0xb4, +0xce, +0x62, +0xdc, +0x47, +0xd5, +0xa8, +0xee, +0xe9, +0x7c, +0x42, +0x02, +0xf9, +0x9e, +0xe5, +0x15, +0xeb, +0xd9, +0xdc, +0x2c, +0xf5, +0x50, +0x3d, +0x58, +0xa8, +0x98, +0x74, +0xb4, +0x22, +0x85, +0x71, +0x91, +0xb9, +0x3c, +0xc4, +0x4d, +0x1c, +0xa5, +0xc8, +0x86, +0xd6, +0xb0, +0xe6, +0x1f, +0xbc, +0x29, +0x2b, +0xaf, +0x6f, +0x2d, +0xfc, +0x7a, +0x8e, +0x6f, +0xca, +0x34, +0x2e, +0x64, +0xda, +0xb4, +0x0f, +0xc9, +0x2d, +0x28, +0x2f, +0x36, +0xc9, +0xd9, +0x53, +0xc9, +0xd0, 
+0x81, +0x03, +0xdd, +0x08, +0xd5, +0xfc, +0xf9, +0x59, +0x45, +0x76, +0x7e, +0x20, +0xbf, +0x4f, +0x55, +0x00, +0x48, +0x45, +0x9d, +0x1c, +0x2f, +0x09, +0x20, +0xe9, +0x27, +0xcf, +0x00, +0x6f, +0x48, +0x0d, +0xf0, +0x00, +0x18, +0x58, +0x4e, +0x8e, +0x51, +0x61, +0x8f, +0xa9, +0x15, +0x43, +0x72, +0xab, +0xdc, +0x1a, +0xe6, +0x7d, +0xfd, +0x21, +0x94, +0xf4, +0x5c, +0x2e, +0x88, +0x81, +0x82, +0xf3, +0x45, +0xd4, +0x48, +0x92, +0x5d, +0x76, +0xf6, +0x97, +0xcf, +0x0a, +0x8a, +0xb1, +0x4d, +0x2f, +0x31, +0x04, +0x26, +0xee, +0x15, +0x77, +0x88, +0x91, +0x06, +0xcb, +0x31, +0x8f, +0x2a, +0xe5, +0x4a, +0x54, +0x22, +0x9e, +0x12, +0xf5, +0x83, +0x99, +0x12, +0xe4, +0x10, +0x77, +0x59, +0x15, +0x59, +0x51, +0xa8, +0xa3, +0xc3, +0x8f, +0xe9, +0x33, +0xeb, +0xb9, +0x0c, +0xb4, +0xdf, +0x7c, +0xc2, +0xdc, +0xe0, +0x65, +0xfc, +0xd2, +0x70, +0x84, +0xaa, +0x81, +0xac, +0xd1, +0x29, +0xf6, +0x9a, +0x78, +0x8b, +0xe1, +0x0c, +0x12, +0x3b, +0x70, +0x96, +0x32, +0xba, +0xf7, +0x6a, +0x04, +0xa2, +0x74, +0xb0, +0x77, +0xd7, +0x69, +0x69, +0x43, +0x27, +0xc4, +0x8b, +0x9c, +0x1a, +0x26, +0x74, +0xfd, +0x8e, +0x42, +0x7c, +0x35, +0x84, +0x51, +0xdb, +0x88, +0x2d, +0x83, +0x1e, +0x2e, +0x8b, +0xb5, +0xfa, +0x15, +0xc7, +0xaa, +0x43, +0x2e, +0x50, +0x91, +0x6a, +0xdb, +0xf9, +0x26, +0xd8, +0x76, +0xa3, +0xe8, +0x54, +0x11, +0x34, +0xd2, +0x8a, +0x2f, +0x50, +0x0a, +0x3e, +0x19, +0x5e, +0x76, +0x9b, +0x68, +0x52, +0x83, +0x6e, +0x7a, +0x04, +0x59, +0x5c, +0xbf, +0x77, +0x20, +0xd2, +0x23, +0xb0, +0x9a, +0xe2, +0xe3, +0x27, +0x32, +0x03, +0xa2, +0xaf, +0xbe, +0xf6, +0xf5, +0x87, +0x4a, +0x60, +0xd8, +0xca, +0x20, +0xf7, +0x40, +0x3d, +0xae, +0x23, +0x9e, +0xdf, +0x68, +0x2a, +0x30, +0x0a, +0x6c, +0xb0, +0x01, +0x9c, +0xee, +0x4f, +0xe4, +0x4f, +0x5f, +0xb1, +0x11, +0xce, +0x28, +0xaf, +0x27, +0xb5, +0x13, +0xa1, +0x76, +0xb9, +0x81, +0xa5, +0xbb, +0x38, +0x5a, +0xa1, +0xd7, +0x27, +0x00, +0x8a, +0x3e, +0x72, +0xb4, +0x9e, +0x31, +0x61, +0x27, +0x71, +0x63, +0xa7, +0xc0, +0xdd, 
+0x48, +0x50, +0x0f, +0x1a, +0xd8, +0x7e, +0x97, +0x52, +0xc5, +0xf1, +0xfe, +0x28, +0x5c, +0x4c, +0xfa, +0xd6, +0xd4, +0x85, +0x6d, +0x40, +0x02, +0x82, +0x46, +0x6c, +0x3e, +0x5d, +0xf0, +0x2f, +0x31, +0xa6, +0x65, +0x91, +0x83, +0xa6, +0x20, +0xdc, +0xc2, +0xb3, +0xbe, +0x85, +0x15, +0xaa, +0x84, +0x72, +0xc5, +0x2e, +0xa0, +0x2c, +0xc2, +0x5d, +0xf4, +0xa9, +0x64, +0x60, +0xc4, +0x07, +0xef, +0xee, +0x0b, +0x91, +0x54, +0x0a, +0x5e, +0xb5, +0xb8, +0xa9, +0x49, +0xbc, +0xb0, +0x0f, +0x0b, +0x4c, +0xf0, +0x50, +0x11, +0xac, +0x11, +0x38, +0xfe, +0x91, +0x1d, +0x7e, +0x8d, +0x39, +0x29, +0x48, +0x66, +0x31, +0x94, +0xaf, +0xb5, +0xbf, +0xe1, +0xd2, +0x5d, +0x44, +0x83, +0xb4, +0xe2, +0x7c, +0xf2, +0x81, +0x79, +0x8e, +0xf2, +0x95, +0x17, +0xe7, +0xfa, +0x5f, +0xa4, +0xf0, +0xb4, +0x32, +0xcd, +0xaa, +0xad, +0xdb, +0x9b, +0x9c, +0x1c, +0x20, +0x75, +0xf0, +0xae, +0x37, +0xc5, +0x3d, +0x4d, +0x86, +0x31, +0xc6, +0xdd, +0x2e, +0x67, +0x97, +0x43, +0x99, +0x67, +0x4d, +0x0b, +0x95, +0x49, +0xf1, +0x6c, +0xb0, +0xe3, +0x8a, +0xfa, +0x26, +0x7b, +0x44, +0x11, +0x35, +0x79, +0x7e, +0xa9, +0x44, +0x65, +0x94, +0x66, +0xf7, +0x93, +0x7b, +0x4b, +0xd8, +0x6c, +0xc1, +0x7a, +0xca, +0x6a, +0x82, +0x8c, +0xc0, +0xa0, +0x5d, +0xfc, +0x9c, +0x1c, +0x51, +0x62, +0x5d, +0x21, +0x94, +0xda, +0x05, +0x9a, +0x25, +0x69, +0x13, +0x37, +0x19, +0x06, +0xd1, +0x59, +0x03, +0xca, +0x10, +0x1b, +0x0a, +0xfb, +0x73, +0xfb, +0xf5, +0xfe, +0x5b, +0xd6, +0x55, +0x02, +0x2f, +0x1d, +0x4c, +0xd1, +0x8f, +0x2a, +0xba, +0x75, +0xc8, +0xb1, +0x0e, +0xc4, +0x4f, +0x7c, +0xc2, +0x2b, +0x69, +0x90, +0x99, +0x83, +0x98, +0x23, +0x3a, +0x4e, +0x89, +0x8f, +0x50, +0xff, +0x8e, +0x0a, +0x75, +0xf4, +0xc9, +0x3d, +0x10, +0x2d, +0xea, +0x79, +0x9c, +0xf7, +0x9f, +0xac, +0x94, +0xdd, +0x5d, +0xf9, +0xb6, +0x6d, +0x82, +0x67, +0x95, +0x6e, +0xae, +0xe2, +0x2c, +0x7e, +0x01, +0x2c, +0xc3, +0x56, +0x6c, +0x75, +0x59, +0xf2, +0xca, +0x2e, +0xa4, +0x4e, +0xb1, +0x83, +0x4f, +0x51, +0x39, +0x76, +0x5e, +0x78, 
+0x2a, +0x2b, +0x48, +0xc2, +0x95, +0x63, +0x1a, +0xda, +0xb3, +0xbf, +0x1a, +0x5e, +0x78, +0x4b, +0x95, +0x47, +0x0b, +0x80, +0xf8, +0xf4, +0x9b, +0x27, +0x03, +0x21, +0x2f, +0x49, +0x66, +0xdf, +0x7d, +0xad, +0x9b, +0xa9, +0xf3, +0x26, +0xe2, +0x3c, +0x17, +0x9f, +0x29, +0xc8, +0xa3, +0x23, +0x4a, +0x50, +0xa1, +0x74, +0x9c, +0xda, +0x01, +0xfd, +0xd8, +0xe7, +0xed, +0x72, +0x70, +0x98, +0x6c, +0x3a, +0xd8, +0x88, +0xd7, +0xc6, +0x81, +0x89, +0xf1, +0xa4, +0x3d, +0x4c, +0x3c, +0x57, +0xe5, +0xfa, +0xe3, +0x6f, +0x04, +0x01, +0xe4, +0x7f, +0xba, +0x2b, +0x05, +0x6e, +0xc8, +0xe9, +0x52, +0x49, +0x8a, +0x7a, +0x5c, +0xca, +0x0f, +0xdb, +0x58, +0x1f, +0xe4, +0xbd, +0xf5, +0x8c, +0xe5, +0x7c, +0xa0, +0xb3, +0xd9, +0x00, +0x95, +0xe4, +0x05, +0xe4, +0x3a, +0x4b, +0x67, +0xf4, +0x51, +0xca, +0x26, +0x9e, +0x55, +0xdd, +0xbd, +0x14, +0x2f, +0x50, +0x29, +0x0f, +0x02, +0xb2, +0x42, +0x0a, +0xc3, +0x45, +0xa2, +0x67, +0xef, +0x48, +0x6b, +0x5c, +0x58, +0xc3, +0x5e, +0xc3, +0x10, +0x87, +0x1e, +0x14, +0x2d, +0x07, +0x1a, +0x55, +0xe1, +0x46, +0x6f, +0x9e, +0x0f, +0xa5, +0x7e, +0xb1, +0x79, +0xe1, +0x21, +0x09, +0x52, +0xc2, +0x50, +0xcc, +0xe5, +0x05, +0xf5, +0x57, +0x88, +0x21, +0x2c, +0xd7, +0xc1, +0x2b, +0x77, +0x44, +0xc2, +0x4e, +0x3f, +0xf0, +0xea, +0x7f, +0x6a, +0x4f, +0x97, +0xe1, +0x36, +0xaf, +0x7d, +0x1f, +0x12, +0xdc, +0x65, +0x3b, +0xcd, +0x07, +0xc2, +0x6f, +0xdf, +0xe7, +0x39, +0x18, +0x54, +0xd3, +0x3a, +0x64, +0x84, +0x34, +0x08, +0xc8, +0x3e, +0xa8, +0xdf, +0x4e, +0xc8, +0x0d, +0x48, +0x61, +0x2b, +0x9a, +0xdc, +0xe8, +0xfa, +0xf0, +0x37, +0x9b, +0xaf, +0x06, +0x8b, +0x78, +0x0f, +0x67, +0xc2, +0x5b, +0x6b, +0xa4, +0x01, +0x40, +0x51, +0xa9, +0x4a, +0x9b, +0x01, +0xd2, +0xde, +0x31, +0x1d, +0x46, +0xe4, +0xbc, +0xd0, +0x93, +0x57, +0x16, +0x91, +0x79, +0x67, +0x12, +0x3b, +0x83, +0x95, +0x21, +0x2f, +0xec, +0xf1, +0x62, +0x4f, +0x24, +0x5b, +0xdc, +0xfe, +0x19, +0xec, +0x45, +0x1c, +0x9f, +0x8d, +0x1c, +0x7c, +0x13, +0x50, +0x54, +0x29, +0x6b, +0xe7, 
+0x03, +0x73, +0x29, +0x5f, +0xd8, +0x5e, +0x05, +0x51, +0x94, +0xb7, +0x08, +0x40, +0xd7, +0x8f, +0x09, +0x9b, +0xc6, +0x11, +0x5c, +0x86, +0xae, +0xfa, +0x8b, +0xb4, +0x3b, +0x86, +0x9d, +0x3d, +0x7c, +0x0a, +0x1b, +0x46, +0x28, +0x58, +0x2f, +0xd5, +0xda, +0x8f, +0x10, +0x9f, +0xa5, +0x86, +0x76, +0xec, +0x0f, +0x07, +0x16, +0xf0, +0xc6, +0x82, +0x96, +0xfa, +0xfa, +0xe2, +0xd3, +0xf7, +0x57, +0xd0, +0x59, +0x46, +0x63, +0x69, +0x3e, +0xb7, +0xdc, +0x2e, +0xaf, +0x0a, +0x5d, +0xb3, +0xe4, +0x7a, +0x1c, +0x07, +0x04, +0x5e, +0x24, +0x58, +0x08, +0x92, +0x21, +0xc3, +0xd0, +0xb8, +0x16, +0x30, +0x1d, +0xd3, +0x1f, +0xba, +0xb1, +0xc5, +0xf1, +0xfd, +0x32, +0xfe, +0x1e, +0x5e, +0x07, +0xb4, +0xea, +0x1c, +0xce, +0xa0, +0x7a, +0x1a, +0x87, +0x94, +0xaa, +0xbf, +0xc0, +0x15, +0x32, +0x12, +0xd4, +0x49, +0xf4, +0xa0, +0xd9, +0x2d, +0xf4, +0x36, +0xc2, +0x67, +0xb7, +0x56, +0x1d, +0xa4, +0x1a, +0xb0, +0xc8, +0x84, +0x89, +0x38, +0xf6, +0x35, +0x9f, +0x1a, +0x47, +0x81, +0xc9, +0x6a, +0x2a, +0xb3, +0x24, +0x19, +0x10, +0xdc, +0xfd, +0x78, +0x7c, +0xc8, +0xbf, +0x85, +0xe3, +0xb4, +0x4a, +0x9a, +0x77, +0x33, +0x81, +0x7e, +0x6d, +0xa1, +0x29, +0xf8, +0xd5, +0x7a, +0x36, +0x09, +0xb3, +0x6b, +0xa3, +0x92, +0x6a, +0x45, +0x89, +0xbf, +0x79, +0x8d, +0xda, +0x5d, +0x06, +0xce, +0x65, +0xf2, +0xd7, +0x9d, +0xca, +0xd7, +0x0a, +0x0a, +0x68, +0x56, +0x20, +0x02, +0xd2, +0xa0, +0x5d, +0xbc, +0xb9, +0x79, +0xf2, +0xf9, +0x5b, +0xb3, +0xac, +0xe7, +0x72, +0x55, +0xda, +0x94, +0xf1, +0x4a, +0x1f, +0xea, +0xbb, +0x75, +0x3a, +0x58, +0x9c, +0x7b, +0xa4, +0xbc, +0x67, +0x23, +0xeb, +0xb8, +0xd1, +0x59, +0x1c, +0x38, +0xb8, +0x78, +0x07, +0xd8, +0x5c, +0xca, +0x30, +0xa5, +0xc8, +0x8d, +0xbf, +0xa2, +0x78, +0x6c, +0x3a, +0x4b, +0x3b, +0x5f, +0x0c, +0x39, +0xbb, +0x68, +0x67, +0xa5, +0x78, +0x25, +0x4e, +0x35, +0x81, +0xc9, +0xee, +0xfb, +0x90, +0x7a, +0x14, +0x9d, +0xfc, +0x9a, +0x1d, +0x9d, +0xb7, +0xae, +0x90, +0x69, +0x79, +0x89, +0xba, +0x9a, +0x40, +0x4b, +0x60, +0x74, +0x58, 
+0x16, +0x2c, +0xc6, +0xbf, +0x5d, +0x16, +0x46, +0xe7, +0x81, +0x63, +0x08, +0xbf, +0x43, +0xdf, +0xaa, +0x70, +0xac, +0x82, +0x63, +0x08, +0x82, +0x44, +0x9b, +0xdd, +0xce, +0x90, +0x0e, +0xff, +0x52, +0xe7, +0x65, +0x9a, +0xdb, +0x51, +0x98, +0xfa, +0xd4, +0xc9, +0x2f, +0x25, +0x28, +0x82, +0x65, +0x34, +0x70, +0x23, +0x32, +0x2b, +0x53, +0x59, +0xaa, +0x9d, +0xec, +0x57, +0x7d, +0x26, +0x59, +0x2f, +0xa2, +0xd4, +0xf8, +0x68, +0x27, +0x81, +0xb5, +0x61, +0x19, +0x8e, +0x24, +0x97, +0x7c, +0x41, +0xe4, +0xf7, +0x53, +0xf2, +0xe0, +0x40, +0xe3, +0xff, +0x29, +0x3f, +0x01, +0x38, +0x2c, +0x4e, +0x4d, +0xa8, +0xbe, +0x5b, +0x48, +0x2e, +0xc9, +0xf9, +0x7d, +0x6d, +0xed, +0x14, +0x6c, +0x57, +0x4f, +0x03, +0x9e, +0xb7, +0xf6, +0x35, +0x9a, +0x92, +0x73, +0xf6, +0xc1, +0x18, +0xa6, +0x3f, +0xaf, +0x9e, +0xca, +0x0d, +0x9b, +0xa6, +0x25, +0x3b, +0x5c, +0x1b, +0x6d, +0x6b, +0x9e, +0x58, +0x9b, +0xbf, +0x29, +0x98, +0xc5, +0x00, +0xc2, +0xd8, +0x92, +0xa6, +0xdf, +0xeb, +0x29, +0x47, +0x3a, +0xc4, +0xd8, +0x08, +0xc7, +0xbb, +0xc5, +0x63, +0x49, +0xfe, +0x7d, +0x9b, +0xb3, +0x74, +0x73, +0xfc, +0xaa, +0x18, +0xd3, +0x01, +0xfd, +0xa4, +0xad, +0x72, +0x85, +0xa5, +0x58, +0x13, +0x5b, +0x7a, +0xcd, +0x3e, +0x9e, +0x56, +0x73, +0x00, +0x75, +0xdf, +0x2a, +0xe4, +0xc9, +0xff, +0x17, +0xfd, +0x0c, +0x36, +0xd9, +0xa5, +0xff, +0xe6, +0x13, +0xe6, +0xcb, +0x0f, +0xa4, +0x80, +0xa8, +0x15, +0xd2, +0x85, +0x30, +0x2c, +0x4a, +0xe5, +0xb5, +0xf6, +0xdc, +0x69, +0x76, +0x5f, +0xbc, +0x4c, +0x98, +0xab, +0x1d, +0x71, +0x56, +0x84, +0x49, +0x1d, +0x8e, +0x13, +0x1e, +0x85, +0x80, +0x5b, +0x74, +0xda, +0xe4, +0xbd, +0x38, +0xf9, +0x88, +0x5b, +0x2a, +0x43, +0x45, +0x5d, +0x7d, +0xb6, +0x40, +0x48, +0xd2, +0xec, +0x92, +0x09, +0xbf, +0xd4, +0xdc, +0xf0, +0x3e, +0x36, +0x7e, +0xd4, +0x70, +0x9b, +0x80, +0xfc, +0x2e, +0x8f, +0xa5, +0x0b, +0x88, +0xee, +0x92, +0x92, +0xc0, +0x8d, +0x4f, +0xbc, +0xdb, +0xf1, +0x2c, +0x60, +0x9f, +0xe1, +0xc6, +0x1f, +0x4b, +0xff, +0x5d, +0x13, +0x7c, 
+0x34, +0x8d, +0x65, +0x5d, +0xec, +0xc6, +0xbf, +0xe8, +0x82, +0xb2, +0x46, +0x42, +0x18, +0x7f, +0xce, +0xc8, +0xc3, +0x46, +0xca, +0xaf, +0x75, +0xaf, +0x3e, +0xf0, +0x14, +0xbb, +0x33, +0x70, +0x42, +0x9b, +0x35, +0xb9, +0x12, +0x90, +0xf4, +0xa9, +0xef, +0xe6, +0xf6, +0x8a, +0xb1, +0x03, +0x48, +0x5a, +0xb0, +0x00, +0x35, +0xaf, +0xc4, +0x66, +0xa6, +0x29, +0x14, +0x2d, +0xe6, +0xfd, +0x29, +0x84, +0xab, +0x3f, +0x07, +0x11, +0xd8, +0x4d, +0x1e, +0x01, +0x79, +0x67, +0xcd, +0x3c, +0xe3, +0x20, +0x61, +0x95, +0x5f, +0x4a, +0x09, +0xb3, +0x60, +0x13, +0xb6, +0x30, +0xba, +0x1f, +0xb7, +0x28, +0xa8, +0x28, +0xd9, +0xdd, +0x1f, +0x48, +0x7e, +0x93, +0xce, +0x32, +0x0f, +0x14, +0xbc, +0x76, +0x0a, +0x6e, +0x7b, +0x94, +0x7f, +0xb4, +0x03, +0x25, +0x62, +0xc2, +0xe6, +0xa5, +0x62, +0xf5, +0xb1, +0x30, +0x2a, +0x08, +0xd6, +0x13, +0x82, +0x65, +0xfa, +0x99, +0xc6, +0xc8, +0x8b, +0x6e, +0xc5, +0x1c, +0x50, +0xe6, +0xb9, +0x0c, +0xc2, +0x1b, +0xf6, +0x93, +0xe3, +0x9a, +0x35, +0x6e, +0x2f, +0x90, +0x01, +0xd3, +0xf2, +0x39, +0x34, +0x73, +0x4c, +0x38, +0x5c, +0xce, +0x33, +0x3d, +0x84, +0x88, +0xa4, +0xb9, +0x7a, +0x84, +0x58, +0x22, +0xe9, +0x37, +0xd5, +0xfd, +0x57, +0x35, +0x21, +0x2a, +0x02, +0x04, +0x79, +0x02, +0x1c, +0xd3, +0x3b, +0x1d, +0x31, +0xf1, +0x73, +0x6e, +0x5b, +0xb4, +0x95, +0xdd, +0x63, +0xe5, +0x1c, +0xae, +0xfb, +0x80, +0x62, +0x90, +0x5e, +0x27, +0x1b, +0x87, +0xc0, +0xa1, +0xf6, +0xbb, +0x19, +0x6a, +0x6d, +0xe4, +0xb3, +0x6b, +0xb7, +0x36, +0x99, +0xc9, +0x83, +0xaa, +0xbc, +0x95, +0x56, +0x80, +0x81, +0xc4, +0xcb, +0x0c, +0x2c, +0x01, +0x69, +0x12, +0xf6, +0xe3, +0x65, +0x47, +0x53, +0x40, +0x23, +0xf2, +0x40, +0xf3, +0x49, +0x11, +0xaf, +0x5e, +0x18, +0x6a, +0x16, +0x51, +0x8a, +0xa9, +0x41, +0x2a, +0x4d, +0xa7, +0xd2, +0xb8, +0xdb, +0xc2, +0x37, +0x94, +0x7e, +0x27, +0xfb, +0x57, +0x22, +0x28, +0x36, +0xe2, +0x0f, +0xe6, +0x10, +0x51, +0xd0, +0x39, +0xd9, +0x5f, +0x35, +0x5e, +0x52, +0xe8, +0xbe, +0xb4, +0x78, +0x80, +0xca, +0xdb, +0xb5, 
+0xe0, +0x79, +0x16, +0x19, +0x18, +0x53, +0x23, +0xb6, +0xa3, +0xee, +0x6a, +0xdf, +0x66, +0x0e, +0xd3, +0x05, +0x5f, +0x4a, +0xe9, +0x9e, +0x51, +0xa9, +0xf4, +0xef, +0x55, +0xc1, +0x1a, +0x28, +0x6c, +0xf8, +0x09, +0x8b, +0xb8, +0xc7, +0x14, +0x7c, +0x86, +0x8c, +0xfd, +0xd4, +0x92, +0xd7, +0x05, +0x16, +0x67, +0x53, +0x95, +0x8d, +0xe0, +0xe4, +0x70, +0x76, +0x28, +0xac, +0x95, +0x4a, +0xc6, +0x66, +0x6b, +0xfd, +0xd2, +0x86, +0x70, +0xc5, +0x08, +0x95, +0x5c, +0x69, +0xdb, +0xca, +0x85, +0x3e, +0xef, +0xc1, +0xcb, +0xb7, +0x85, +0x4f, +0x65, +0x28, +0x6a, +0x29, +0x6e, +0xa9, +0x51, +0x4f, +0xcf, +0x2a, +0x56, +0xfc, +0xc7, +0x6e, +0x69, +0x00, +0xaf, +0x50, +0xb8, +0x9b, +0xe1, +0x02, +0xa7, +0xab, +0x25, +0xf1, +0xbd, +0x55, +0x78, +0x60, +0x6d, +0x37, +0x1d, +0x7c, +0xdb, +0xbc, +0x0f, +0x70, +0xca, +0xb8, +0x15, +0x1f, +0x26, +0x12, +0x0b, +0x56, +0x7c, +0xb2, +0x48, +0xa7, +0xee, +0x73, +0x1a, +0xb5, +0xb9, +0x2a, +0xfb, +0x37, +0x04, +0x4e, +0x44, +0x8f, +0x8c, +0xc8, +0xd1, +0x2b, +0x83, +0x6a, +0xee, +0xf7, +0x8a, +0x2c, +0x9e, +0x48, +0xe5, +0xf3, +0xad, +0xdf, +0xf2, +0x9f, +0x31, +0xb5, +0x4c, +0xb8, +0x91, +0x79, +0xb6, +0xc0, +0x92, +0x42, +0x01, +0x05, +0x63, +0xa7, +0xbd, +0x2b, +0x31, +0x43, +0x07, +0xfe, +0x1c, +0x40, +0xec, +0xc9, +0x24, +0x0b, +0x7d, +0x01, +0xaa, +0x43, +0x0a, +0x7e, +0xae, +0xbf, +0xb5, +0x74, +0x3a, +0x91, +0x21, +0x23, +0x35, +0xa0, +0xeb, +0xd6, +0x65, +0x35, +0xce, +0x81, +0x83, +0x4d, +0x46, +0x70, +0x44, +0x33, +0x9e, +0x37, +0xa0, +0xe0, +0x76, +0x9f, +0x02, +0x4e, +0xcd, +0x00, +0xb0, +0x4c, +0x9c, +0xf1, +0x33, +0x4b, +0xa0, +0xc5, +0x88, +0x46, +0x03, +0xa0, +0x99, +0x81, +0xb0, +0x43, +0x02, +0xb4, +0xe9, +0x45, +0xa8, +0x37, +0x85, +0x64, +0x7e, +0xf9, +0x9a, +0x8f, +0xdd, +0x3f, +0xb7, +0xea, +0xe5, +0x1d, +0x9d, +0x4f, +0x5e, +0x80, +0x14, +0xe1, +0xcc, +0xd5, +0x2e, +0x57, +0xab, +0x79, +0x0c, +0xb5, +0xc5, +0x71, +0x18, +0xc0, +0x80, +0xe4, +0x87, +0x7e, +0xbc, +0x5c, +0x43, +0x8a, +0x7a, +0x7b, +0xbe, 
+0x40, +0x76, +0xf7, +0x04, +0x23, +0xf3, +0xd6, +0x76, +0xd1, +0x82, +0x93, +0x04, +0xe1, +0x29, +0xf7, +0x60, +0xd3, +0x6d, +0xee, +0x85, +0xc9, +0x23, +0xfe, +0x9a, +0x93, +0xe6, +0x61, +0xbd, +0x3d, +0x3d, +0x7a, +0xf8, +0x48, +0x1c, +0xc3, +0x6c, +0x61, +0x42, +0x66, +0x9b, +0x70, +0x56, +0xdc, +0x97, +0xcd, +0x75, +0xb3, +0x83, +0x40, +0xfe, +0x2b, +0x81, +0xa6, +0x12, +0xd3, +0x84, +0x0b, +0x9f, +0x13, +0x65, +0x91, +0xb0, +0x3e, +0x75, +0x13, +0x0c, +0xd5, +0x62, +0x81, +0xed, +0x06, +0xf8, +0x96, +0xf9, +0x48, +0xea, +0x98, +0x12, +0x4a, +0x6e, +0xf3, +0x57, +0xf4, +0x09, +0x74, +0x31, +0xda, +0x98, +0x1e, +0xdb, +0x43, +0x90, +0xb6, +0xdb, +0x78, +0x88, +0xdd, +0x39, +0xec, +0xf2, +0xe5, +0xc2, +0x37, +0x55, +0x9f, +0xbb, +0xb4, +0x6c, +0x1e, +0xd8, +0x9e, +0x46, +0x79, +0xa7, +0xa1, +0x89, +0x41, +0xe0, +0xf6, +0xa6, +0x1c, +0xf2, +0x12, +0x0d, +0xee, +0xa2, +0x3b, +0xcc, +0x35, +0x97, +0x44, +0x6b, +0x56, +0xac, +0xb6, +0xdf, +0x2a, +0x70, +0xde, +0xf2, +0xf5, +0xcc, +0xa5, +0xc5, +0xb8, +0x3b, +0x32, +0x62, +0xcc, +0xb3, +0xc9, +0x5f, +0x32, +0x6a, +0xe5, +0x85, +0x03, +0x2b, +0xe2, +0x73, +0x4f, +0x00, +0xba, +0x21, +0x6e, +0x51, +0xc6, +0x52, +0x5c, +0x34, +0x16, +0x58, +0x5c, +0xb2, +0xa5, +0x70, +0x84, +0xbd, +0xa8, +0x25, +0x7f, +0x04, +0x6b, +0xd2, +0xf8, +0x35, +0xa1, +0xc2, +0x3b, +0x52, +0xce, +0x29, +0x8e, +0x7a, +0x5f, +0x5b, +0x04, +0x3b, +0x0b, +0xc1, +0x0a, +0x25, +0xa8, +0xd1, +0x59, +0x83, +0x03, +0x90, +0x42, +0x74, +0xcb, +0xc8, +0x5b, +0x7f, +0x6a, +0x2b, +0xe5, +0x6a, +0x56, +0x77, +0xe2, +0x96, +0xce, +0xf3, +0x99, +0x36, +0xa5, +0x8b, +0x2f, +0xf5, +0x17, +0xf7, +0x92, +0x6c, +0xf5, +0x62, +0xd8, +0x25, +0x0f, +0xdb, +0xfc, +0x5e, +0x08, +0x4b, +0x27, +0x13, +0x2a, +0x9e, +0x4c, +0xb4, +0xb9, +0xc8, +0x62, +0x4e, +0xb3, +0x13, +0x0d, +0xc1, +0x2b, +0x8e, +0xff, +0x8e, +0x52, +0x2a, +0x52, +0x95, +0x5a, +0x09, +0xd8, +0x1e, +0x29, +0x75, +0x49, +0x54, +0xf3, +0x30, +0xd0, +0x2b, +0x03, +0xaf, +0x53, +0x6c, +0xbd, +0x8a, +0xdc, 
+0x44, +0x96, +0x68, +0x51, +0xf8, +0xd8, +0x7a, +0xf5, +0x35, +0xc3, +0x11, +0x5c, +0x5e, +0x9a, +0x70, +0xf4, +0xac, +0x5c, +0xee, +0x8d, +0x68, +0x53, +0x0a, +0xb7, +0x7b, +0x16, +0x6b, +0xb8, +0x08, +0xf0, +0xd6, +0xda, +0x29, +0x17, +0xf2, +0xf3, +0x35, +0x8e, +0x0d, +0x03, +0x98, +0xe8, +0x53, +0x33, +0x84, +0x79, +0xa3, +0x6c, +0x12, +0xc5, +0x76, +0x04, +0xd5, +0x5f, +0x58, +0xd5, +0xac, +0x57, +0x5f, +0xfe, +0xd0, +0x22, +0xf3, +0x26, +0x07, +0x99, +0x1c, +0xe8, +0xa0, +0x32, +0x1d, +0xdb, +0x0f, +0xcb, +0x10, +0x85, +0x6f, +0x22, +0x7a, +0x05, +0x66, +0x91, +0x2f, +0x58, +0xa0, +0xc4, +0xa6, +0x1c, +0xa9, +0x16, +0x07, +0x84, +0x82, +0x06, +0xc0, +0x65, +0xae, +0x78, +0xf2, +0xcf, +0xa3, +0x9e, +0xf0, +0xe0, +0x2e, +0x92, +0x6e, +0xb8, +0xa6, +0x77, +0x59, +0x01, +0x8d, +0xac, +0xfa, +0x5e, +0x29, +0xd0, +0xda, +0x5e, +0xeb, +0x8a, +0x5e, +0xd6, +0x9d, +0x6f, +0xda, +0x7d, +0x25, +0x30, +0xf9, +0xc0, +0xee, +0xdf, +0x14, +0x6d, +0x07, +0x09, +0x0a, +0x3f, +0xfb, +0x23, +0x40, +0x19, +0xe6, +0xd2, +0x3c, +0xd0, +0x69, +0x00, +0xf6, +0xaa, +0x4b, +0xf7, +0x0e, +0xc0, +0x0e, +0x4d, +0x81, +0x01, +0x44, +0xad, +0xf4, +0x33, +0x83, +0xc5, +0x45, +0x3f, +0x33, +0xea, +0x5c, +0x8d, +0xa7, +0xcd, +0xef, +0x5c, +0x40, +0xbc, +0xcf, +0x9b, +0x04, +0x30, +0x12, +0x90, +0x83, +0xbd, +0xc8, +0x1a, +0x9a, +0x8c, +0xc0, +0x56, +0x39, +0xc1, +0x7c, +0xd8, +0x66, +0xa8, +0x2c, +0x27, +0x4d, +0xe4, +0xa3, +0x84, +0xf6, +0xae, +0xbd, +0x41, +0x69, +0xa2, +0xc3, +0x69, +0xd3, +0x34, +0x06, +0x3a, +0x34, +0x15, +0x61, +0xa5, +0xb1, +0xae, +0xd0, +0xc5, +0x01, +0x28, +0x82, +0x7b, +0x67, +0x48, +0x5b, +0x71, +0xb8, +0xe3, +0x6b, +0xe3, +0x51, +0x35, +0xac, +0x18, +0x4b, +0x7a, +0x7e, +0x88, +0xc8, +0x6f, +0x8d, +0xe0, +0x68, +0x53, +0x70, +0x4f, +0x0a, +0xa8, +0x3e, +0x44, +0xea, +0xd2, +0x9e, +0x64, +0xb7, +0xca, +0x50, +0xc5, +0x08, +0x46, +0xc7, +0x21, +0x7f, +0x58, +0x8d, +0x3e, +0x56, +0x8b, +0xa9, +0x4d, +0x81, +0x95, +0xe2, +0x20, +0xc6, +0xd0, +0x2d, +0x68, +0x5e, 
+0x99, +0xbc, +0x97, +0x13, +0xbf, +0xd6, +0x48, +0x53, +0x7e, +0x41, +0x96, +0x3b, +0x44, +0x34, +0xea, +0xf6, +0xbe, +0x8a, +0x04, +0xdf, +0x40, +0xa5, +0x54, +0x37, +0xec, +0x9e, +0x2f, +0x37, +0x52, +0xa6, +0xcd, +0xdd, +0xc8, +0x6f, +0xdd, +0xc4, +0x04, +0x03, +0xa1, +0xe7, +0xb5, +0xa4, +0x1c, +0x7a, +0xc8, +0x0b, +0x8e, +0x32, +0x6b, +0x6f, +0xc7, +0x5b, +0x7a, +0x79, +0xe5, +0xf5, +0x48, +0x16, +0x75, +0xd8, +0xb4, +0x27, +0xc4, +0x92, +0xc3, +0xcb, +0x67, +0xf4, +0x46, +0xcf, +0xed, +0x2e, +0x38, +0x8c, +0x9d, +0xab, +0x78, +0x80, +0xc9, +0x15, +0x57, +0x77, +0x3e, +0x6d, +0xf7, +0xdc, +0x4f, +0x3d, +0xd5, +0x10, +0x7e, +0x64, +0x2c, +0xdd, +0x34, +0xb1, +0x8d, +0xde, +0x59, +0x54, +0x5a, +0x20, +0xcb, +0x7f, +0xe2, +0x67, +0x3d, +0xe3, +0x21, +0xd0, +0x36, +0xb3, +0x10, +0xdc, +0x78, +0xd5, +0xa1, +0x64, +0x69, +0x75, +0x8d, +0xce, +0x6e, +0xab, +0xa7, +0x79, +0x8c, +0xfb, +0x63, +0x57, +0xaf, +0x9b, +0xd0, +0x9b, +0x6b, +0x82, +0xf1, +0x18, +0x87, +0xc1, +0x8a, +0x93, +0xd4, +0xf8, +0xfa, +0xad, +0xe4, +0xfd, +0x73, +0xfc, +0x84, +0x41, +0x49, +0x21, +0x53, +0x22, +0xdf, +0xd9, +0x06, +0xd0, +0xf4, +0xa1, +0x1b, +0x04, +0xe5, +0x08, +0xf3, +0x3f, +0x2c, +0x1c, +0x73, +0x07, +0x0a, +0x82, +0xdc, +0xb9, +0x0b, +0x93, +0xdd, +0x22, +0xdb, +0xe7, +0x8d, +0xf7, +0xe1, +0xb9, +0x55, +0xa4, +0xeb, +0xd2, +0x19, +0x4b, +0x7c, +0x27, +0x14, +0x65, +0x5c, +0xaa, +0x31, +0x6f, +0x7c, +0xe8, +0xfc, +0x9d, +0xce, +0x6d, +0x0a, +0x76, +0xa9, +0x67, +0xab, +0x74, +0xdc, +0x75, +0x31, +0x58, +0x0d, +0x55, +0xb9, +0xe3, +0x01, +0x96, +0xf8, +0xcb, +0x9b, +0xb7, +0x0f, +0x80, +0x8f, +0x6c, +0x65, +0xc9, +0xe1, +0x4a, +0x4b, +0x8d, +0x36, +0xd9, +0x9a, +0xdd, +0xe4, +0xad, +0x00, +0x7c, +0x47, +0xb5, +0xb5, +0x8c, +0x34, +0x6a, +0x50, +0xb7, +0x88, +0xa1, +0x44, +0xae, +0xb3, +0x84, +0x13, +0x70, +0xa9, +0x42, +0x5f, +0xa8, +0xb5, +0xcb, +0x4f, +0xe9, +0x40, +0x53, +0x23, +0x59, +0x51, +0xc0, +0xf1, +0xc0, +0x41, +0xac, +0x7b, +0xf9, +0xbb, +0x91, +0xbe, +0xb5, +0xfb, 
+0x53, +0x9e, +0xf4, +0xe9, +0xad, +0xa1, +0x62, +0x83, +0x1b, +0x24, +0xb8, +0x48, +0xbb, +0x70, +0x29, +0x11, +0x11, +0xf1, +0x08, +0x1d, +0x46, +0x7a, +0xfb, +0x52, +0x3d, +0x97, +0x60, +0xcc, +0xb1, +0xed, +0xa2, +0xb8, +0xda, +0x9e, +0x44, +0x5d, +0x95, +0x53, +0x2b, +0xc4, +0x8a, +0x4c, +0xdd, +0xb3, +0xe2, +0x06, +0xa6, +0x81, +0x36, +0xf2, +0x36, +0xfd, +0x7d, +0x46, +0x20, +0x31, +0x73, +0xc8, +0xbc, +0x85, +0xa2, +0x89, +0x74, +0x56, +0xaf, +0x76, +0xff, +0xea, +0xae, +0xe3, +0x34, +0xbc, +0x72, +0xfe, +0x35, +0xf1, +0x69, +0x43, +0x6a, +0x83, +0x25, +0x3a, +0x97, +0x33, +0xc6, +0x35, +0x84, +0xa5, +0x6b, +0x58, +0x0c, +0xdc, +0xd2, +0xd6, +0x43, +0x0d, +0x43, +0x77, +0x11, +0xc0, +0x70, +0x67, +0x48, +0x2d, +0x26, +0x45, +0xfc, +0xe6, +0x0f, +0x33, +0x2a, +0xe7, +0xf3, +0xb6, +0x63, +0xf8, +0xc5, +0xad, +0x0b, +0x0e, +0x91, +0x9e, +0x61, +0x8a, +0x08, +0xde, +0x4b, +0x02, +0x8a, +0x96, +0xa1, +0xb7, +0x95, +0xda, +0x45, +0xd2, +0x17, +0x3c, +0xf4, +0xa4, +0xc6, +0x35, +0x52, +0xc4, +0x38, +0x76, +0x92, +0x66, +0x33, +0x77, +0x6f, +0x41, +0xdc, +0xc0, +0x02, +0x0d, +0x48, +0x0a, +0xfd, +0x9e, +0x0e, +0x42, +0x1b, +0xe3, +0xf6, +0xb1, +0xe7, +0x14, +0xc8, +0x58, +0xff, +0xdb, +0xe2, +0x5f, +0x08, +0xba, +0x98, +0xcc, +0x69, +0x06, +0xee, +0x4b, +0x9c, +0x4d, +0xfe, +0x5f, +0xb4, +0x3d, +0x71, +0x8f, +0x90, +0x63, +0xb5, +0xd5, +0x82, +0x96, +0xcc, +0x60, +0x5d, +0x5f, +0x3e, +0x78, +0x50, +0x89, +0x1f, +0xb9, +0x26, +0xbf, +0x40, +0xa2, +0x58, +0xae, +0xe5, +0x44, +0xd3, +0x4f, +0x6a, +0x5a, +0x9f, +0x62, +0x06, +0x63, +0x2f, +0x26, +0xf3, +0x6d, +0x07, +0xba, +0xc5, +0x44, +0x8b, +0x03, +0x10, +0x9b, +0xfd, +0x4a, +0x82, +0x45, +0x83, +0x80, +0x89, +0xeb, +0x71, +0x9d, +0x7b, +0xa8, +0x97, +0xef, +0xac, +0x63, +0x8e, +0x6a, +0x34, +0xca, +0x18, +0x82, +0x82, +0x8b, +0xcb, +0xa2, +0xe6, +0x1b, +0x78, +0xf7, +0xb0, +0x86, +0xaa, +0xec, +0x30, +0xcf, +0x31, +0x67, +0x80, +0x54, +0x45, +0x25, +0x37, +0x26, +0x20, +0xcc, +0xcd, +0xf9, +0x01, +0x7a, +0xcc, 
+0xcf, +0x9b, +0x72, +0x90, +0x6d, +0xf6, +0x1c, +0x2b, +0xaa, +0x9b, +0x9d, +0xb0, +0x03, +0x00, +0xcc, +0xd0, +0x62, +0x6f, +0xa9, +0x08, +0x9f, +0xca, +0x7a, +0x20, +0x0c, +0xdd, +0x5d, +0x0f, +0x84, +0xc1, +0x38, +0x6f, +0xc7, +0x74, +0xe9, +0x88, +0x03, +0xe1, +0x5a, +0x99, +0x92, +0xb3, +0xe8, +0x68, +0x28, +0xac, +0x10, +0x67, +0x76, +0xda, +0xfc, +0x96, +0xd7, +0xc5, +0xfc, +0x00, +0xb5, +0xeb, +0x03, +0x28, +0x15, +0x03, +0x04, +0x5d, +0xf4, +0x1b, +0xdc, +0x9c, +0xc1, +0xa4, +0x13, +0xff, +0xaf, +0x34, +0x01, +0x64, +0x0b, +0xe9, +0x75, +0xac, +0x0a, +0x5f, +0x3c, +0x87, +0x0f, +0xb5, +0xe2, +0x3f, +0x35, +0x71, +0xf4, +0xf1, +0x93, +0x88, +0x5e, +0x9c, +0xdd, +0x90, +0x8e, +0x79, +0x60, +0x75, +0x47, +0x71, +0x6a, +0x0a, +0xa5, +0x35, +0x4c, +0x0e, +0xa0, +0x29, +0x21, +0x83, +0x4a, +0xf4, +0x5e, +0xac, +0x65, +0xab, +0x89, +0x9a, +0x41, +0x4f, +0x14, +0x86, +0x7b, +0xe4, +0xe9, +0xf6, +0xe4, +0x83, +0xda, +0x2a, +0x04, +0x51, +0xf2, +0xd4, +0xae, +0xd0, +0x7f, +0x9f, +0xbc, +0xcd, +0xa5, +0xe1, +0xcb, +0x20, +0xd4, +0x4d, +0x05, +0x54, +0x66, +0x5d, +0x35, +0xce, +0x5a, +0xe1, +0xfb, +0xe7, +0xbd, +0xf6, +0x01, +0x34, +0x48, +0xe7, +0x60, +0x73, +0x58, +0xaf, +0x55, +0xf6, +0xce, +0x16, +0xd4, +0xf3, +0x9b, +0xaa, +0xe8, +0xd7, +0x6c, +0xea, +0x1b, +0x4d, +0x84, +0x1f, +0x4e, +0x0c, +0x44, +0x82, +0x72, +0x32, +0x22, +0xce, +0xc2, +0xfd, +0xba, +0x46, +0x3e, +0x73, +0x28, +0x5f, +0xa6, +0x19, +0x81, +0xe6, +0x45, +0xed, +0x01, +0xea, +0x74, +0xcd, +0x64, +0xa9, +0x5a, +0x31, +0x61, +0x82, +0xa1, +0x9f, +0x30, +0x9d, +0x0c, +0xa8, +0x57, +0x1c, +0xd5, +0x50, +0x4e, +0x96, +0x4b, +0x8c, +0xa7, +0xff, +0x59, +0x04, +0x29, +0x17, +0x3c, +0xec, +0x73, +0xfd, +0x81, +0x5e, +0x24, +0xd0, +0xf7, +0xd4, +0x8f, +0xfc, +0xa2, +0x9f, +0x21, +0x06, +0xf1, +0x4b, +0x2e, +0x2c, +0x3d, +0x2b, +0x86, +0x81, +0x3f, +0xa7, +0xdc, +0xa7, +0xf0, +0x1d, +0xd0, +0xaa, +0xbb, +0x96, +0x52, +0xae, +0xc9, +0xda, +0x22, +0x83, +0xc5, +0xfc, +0x0d, +0x1c, +0xa6, +0x49, +0x08, 
+0x74, +0x97, +0x65, +0xdb, +0x23, +0xf8, +0x2c, +0x57, +0xa9, +0x60, +0x82, +0x11, +0xc7, +0xa9, +0x86, +0x57, +0x1f, +0xb3, +0xc8, +0xd8, +0x51, +0x99, +0x3d, +0x32, +0x7f, +0x83, +0x81, +0xf6, +0x64, +0x7b, +0x29, +0x81, +0x65, +0x48, +0x1d, +0x28, +0x8c, +0x99, +0xf7, +0xe8, +0xf4, +0xfa, +0xd9, +0x92, +0x0c, +0xad, +0xb3, +0x73, +0x00, +0x01, +0x62, +0x2c, +0x0c, +0xb6, +0x73, +0xff, +0xb7, +0xf0, +0x8a, +0x6c, +0x56, +0x7f, +0xa0, +0x23, +0xf3, +0x28, +0xf1, +0x49, +0x1c, +0x9a, +0x50, +0xd6, +0x37, +0x58, +0xfe, +0x1d, +0x94, +0xf5, +0x95, +0x89, +0x65, +0xed, +0x3e, +0x47, +0x33, +0xa6, +0x36, +0xdc, +0x69, +0xc7, +0x85, +0x0b, +0xa2, +0x35, +0x17, +0x01, +0xa6, +0x7b, +0x31, +0xd7, +0xda, +0xed, +0x18, +0xdf, +0x07, +0x4a, +0xd2, +0x4a, +0xa1, +0x7f, +0x8a, +0xca, +0xe7, +0x8a, +0x65, +0xb8, +0x3d, +0x88, +0x9e, +0x61, +0x3b, +0xda, +0x05, +0x30, +0xae, +0x9f, +0x83, +0x16, +0x7f, +0xc9, +0x75, +0xdd, +0x0a, +0x17, +0xea, +0xc4, +0x27, +0xe8, +0x21, +0x42, +0x48, +0x2d, +0x91, +0xd4, +0xd8, +0x59, +0x03, +0x50, +0xb9, +0xbc, +0xc9, +0x55, +0x3d, +0xc2, +0x4b, +0x5f, +0xe3, +0xca, +0x28, +0xe4, +0x63, +0x69, +0xfd, +0xa5, +0xf6, +0x1b, +0xb2, +0x01, +0xb3, +0x76, +0x54, +0x95, +0xe2, +0x9d, +0xd4, +0x4b, +0xa6, +0x00, +0xb4, +0xe8, +0x41, +0x80, +0xe0, +0x0b, +0xfd, +0xe6, +0xbf, +0x0a, +0xb8, +0xba, +0xd2, +0x57, +0xff, +0xc7, +0x77, +0xe4, +0x13, +0xd9, +0x75, +0x8f, +0xee, +0x21, +0x66, +0x21, +0xd6, +0xb9, +0x1b, +0x42, +0x93, +0xe6, +0x27, +0x38, +0xcb, +0x10, +0x11, +0xdc, +0xe9, +0xfd, +0x5a, +0xf8, +0x00, +0xc2, +0x22, +0xa7, +0x95, +0xa7, +0xe9, +0x78, +0x3d, +0x31, +0x29, +0x7d, +0xfa, +0x9a, +0x6f, +0x5a, +0xe4, +0xb6, +0x9d, +0x45, +0x30, +0xbb, +0x7f, +0xff, +0xca, +0x56, +0xb2, +0xfd, +0x20, +0x57, +0xa9, +0xee, +0x2b, +0x9e, +0xd7, +0xfc, +0x53, +0x93, +0x1a, +0x60, +0x91, +0xf6, +0xf7, +0x0d, +0x7d, +0xf0, +0x04, +0x02, +0xd1, +0x8e, +0x26, +0x56, +0x1e, +0xda, +0xcc, +0x55, +0xc3, +0x30, +0x1b, +0x1c, +0xad, +0xa8, +0xa6, +0x71, +0xf2, 
+0x00, +0x94, +0xfe, +0x17, +0xf9, +0x04, +0x08, +0xdf, +0xb2, +0x07, +0x10, +0x33, +0x94, +0x28, +0xe7, +0x9a, +0x43, +0x1e, +0xad, +0x6f, +0x49, +0xfd, +0xd1, +0xec, +0xf9, +0x7b, +0x40, +0x8a, +0x9a, +0x49, +0xbc, +0xfe, +0x6b, +0xb4, +0xce, +0x0f, +0xf3, +0x85, +0xad, +0x90, +0xb7, +0xf1, +0x6e, +0x16, +0x41, +0xb5, +0xca, +0x5b, +0x1a, +0x8b, +0x74, +0xa7, +0x94, +0xdc, +0x8f, +0x21, +0xbd, +0x68, +0xbf, +0xf4, +0x8f, +0x67, +0x2b, +0x3a, +0xce, +0x97, +0x8f, +0x55, +0xa5, +0xf6, +0xf1, +0xf2, +0x6a, +0xed, +0x82, +0xe5, +0x91, +0xab, +0xf1, +0xb3, +0xbe, +0x9f, +0x9a, +0x99, +0x79, +0x4a, +0x25, +0x9f, +0x99, +0xad, +0x16, +0x0f, +0x5c, +0x0e, +0x66, +0x9d, +0x4c, +0x58, +0x21, +0xcf, +0xcc, +0x19, +0x09, +0x46, +0x76, +0x0b, +0x93, +0x46, +0x34, +0x85, +0xee, +0x58, +0x70, +0x24, +0x45, +0xc1, +0xed, +0x0f, +0xed, +0x1e, +0xb8, +0xc9, +0xf3, +0x22, +0x90, +0xe2, +0x26, +0x2f, +0x15, +0x19, +0xf2, +0xb8, +0x10, +0x26, +0x0b, +0x35, +0xde, +0xda, +0x29, +0x8f, +0x45, +0x46, +0xec, +0x00, +0xe6, +0x03, +0x4e, +0xb9, +0x54, +0x3a, +0x9d, +0x15, +0xd7, +0x27, +0xd6, +0x04, +0x0c, +0x3e, +0x3e, +0xf7, +0xcf, +0xcc, +0x9b, +0x1d, +0xa5, +0x93, +0xf8, +0xd2, +0xe3, +0x1b, +0x7c, +0xce, +0xfc, +0x5f, +0x0a, +0x35, +0xa9, +0x99, +0x37, +0x83, +0xec, +0x2d, +0x7a, +0x6d, +0x07, +0x76, +0x35, +0x02, +0xfd, +0x3c, +0xa8, +0x9e, +0x47, +0x2b, +0xcc, +0x78, +0x52, +0xd3, +0x8d, +0x57, +0xb6, +0x79, +0x3c, +0x09, +0x33, +0x14, +0x61, +0x47, +0x84, +0xd9, +0x3d, +0x63, +0xe8, +0xdb, +0x29, +0xb6, +0x45, +0xe1, +0x9c, +0x54, +0x0d, +0x23, +0xf7, +0x8d, +0x00, +0x9a, +0x03, +0x4b, +0xaf, +0x19, +0xf5, +0x45, +0xa5, +0x97, +0x5d, +0x47, +0x78, +0x8a, +0x99, +0x0d, +0x40, +0xac, +0xf6, +0xa5, +0xf5, +0x3b, +0x4e, +0xdc, +0xd9, +0x36, +0xed, +0xd0, +0x38, +0xee, +0xa0, +0x24, +0x71, +0xbc, +0x69, +0x6d, +0xa4, +0x36, +0xe8, +0xe9, +0xb7, +0x21, +0xd1, +0xb9, +0xe1, +0xd6, +0xa7, +0x4d, +0xf8, +0x7b, +0x12, +0x63, +0x56, +0x8b, +0x21, +0xdd, +0x05, +0x34, +0xed, +0x7c, +0xc9, 
+0x04, +0x75, +0xfd, +0xd0, +0x38, +0xf0, +0xa7, +0x23, +0x06, +0x77, +0x7c, +0xd5, +0xc2, +0x46, +0xd6, +0xe4, +0x03, +0x5b, +0x13, +0x64, +0x6b, +0xf9, +0xac, +0x5e, +0x6e, +0xd5, +0x55, +0xca, +0xfe, +0x5f, +0x5e, +0x44, +0xa3, +0x88, +0x17, +0xdb, +0x36, +0x93, +0xe6, +0x02, +0x02, +0x89, +0x39, +0x86, +0x3f, +0x5e, +0x7e, +0x91, +0xff, +0xca, +0xbe, +0x95, +0xbd, +0x06, +0xe4, +0xf9, +0x5f, +0x99, +0xa9, +0x1a, +0xc0, +0x5d, +0xd1, +0x29, +0x38, +0x77, +0xb4, +0x00, +0xf3, +0x57, +0x16, +0x34, +0x52, +0x04, +0xe7, +0xf4, +0x6c, +0xf4, +0xb1, +0x75, +0x6d, +0x7a, +0x63, +0x14, +0x5b, +0x9f, +0x56, +0x69, +0x6b, +0x13, +0x8e, +0x76, +0x9c, +0x36, +0xa6, +0x2f, +0xd5, +0x57, +0xcf, +0x20, +0xbc, +0x78, +0x0a, +0x59, +0x0f, +0xdc, +0xf4, +0xde, +0x5d, +0xf1, +0x99, +0x91, +0xd8, +0xf8, +0x82, +0x90, +0x28, +0xe3, +0x2e, +0x75, +0x3d, +0xf1, +0x10, +0xf6, +0xe8, +0xfb, +0x09, +0x84, +0x7a, +0xf0, +0xab, +0xa4, +0x08, +0xf5, +0xfb, +0x62, +0xfe, +0x6a, +0x70, +0x79, +0x52, +0x97, +0xea, +0x13, +0xf0, +0x3b, +0xe1, +0xf6, +0xab, +0x64, +0x89, +0x23, +0x34, +0x20, +0x69, +0xd2, +0xf5, +0x8d, +0x0b, +0x62, +0x5e, +0x04, +0x1c, +0xf1, +0x2b, +0xe9, +0x79, +0x98, +0xa1, +0x34, +0x87, +0x41, +0xe1, +0x83, +0x70, +0x30, +0x94, +0xdc, +0x7a, +0x34, +0x10, +0xea, +0x6c, +0xe2, +0xfb, +0xc8, +0x29, +0x97, +0x26, +0xa0, +0x64, +0x2e, +0x56, +0x90, +0xf2, +0xc6, +0x7d, +0x86, +0x76, +0x41, +0xe3, +0x62, +0x5e, +0x03, +0xda, +0x33, +0x8f, +0x4b, +0x44, +0x7a, +0x9c, +0x16, +0x2d, +0xc5, +0x16, +0xfa, +0x5a, +0x99, +0xa2, +0x51, +0x60, +0x66, +0x01, +0xa9, +0x75, +0x05, +0x44, +0x7e, +0x3b, +0xc0, +0x97, +0xbc, +0x58, +0xe0, +0x0a, +0x18, +0x8f, +0x5a, +0xd1, +0x83, +0x7f, +0x15, +0x18, +0xae, +0xbb, +0x75, +0xb0, +0x5b, +0x9c, +0xa2, +0x12, +0x2d, +0x16, +0x4a, +0x8e, +0xc7, +0x69, +0xd5, +0xc3, +0xa1, +0x39, +0x5d, +0x2d, +0x88, +0xb8, +0x29, +0xa4, +0x5d, +0x4a, +0x76, +0x7c, +0x8b, +0x3b, +0xf0, +0x55, +0xa2, +0x6c, +0xb2, +0xb6, +0xf7, +0xaa, +0x05, +0xb3, +0x19, +0x4c, 
+0x7e, +0x7d, +0xdf, +0x30, +0xed, +0x1c, +0x61, +0x7b, +0x08, +0x30, +0x03, +0x22, +0x57, +0x15, +0x15, +0x7b, +0x93, +0x5e, +0xbf, +0x30, +0x07, +0xa2, +0xd6, +0xf6, +0xc8, +0xb4, +0xce, +0xb7, +0x96, +0x62, +0x33, +0x83, +0xcd, +0xd4, +0x92, +0x00, +0xc7, +0x4e, +0xf8, +0x5e, +0xb6, +0x1d, +0xcf, +0x04, +0x4a, +0x13, +0x11, +0x7d, +0x2a, +0xa5, +0xd6, +0xb0, +0xee, +0xfd, +0xe8, +0x4f, +0xc0, +0x9d, +0x70, +0xe0, +0x27, +0x41, +0x80, +0x2a, +0x92, +0x79, +0x23, +0xf1, +0x70, +0x5f, +0x1b, +0xcf, +0x89, +0xd5, +0xa5, +0xdb, +0x5c, +0x65, +0xc6, +0x86, +0x37, +0x44, +0xbe, +0xdf, +0x90, +0xd0, +0x2c, +0xe2, +0x24, +0x0e, +0x3e, +0xdb, +0xe3, +0x4f, +0xdd, +0xdf, +0xe1, +0x40, +0x2b, +0x64, +0x58, +0x70, +0x4f, +0xae, +0xc9, +0x8a, +0x67, +0x03, +0x34, +0xba, +0xf4, +0x06, +0x8c, +0xfb, +0x6f, +0x1c, +0x1c, +0xa0, +0x70, +0x72, +0xdd, +0xbc, +0x12, +0xd7, +0xe2, +0xc0, +0x6d, +0x31, +0x19, +0x5c, +0xe2, +0x14, +0x4f, +0x1a, +0x50, +0xb6, +0xb0, +0x1d, +0x26, +0x54, +0x8e, +0xfa, +0x70, +0xf7, +0xeb, +0x58, +0x21, +0x6f, +0xf0, +0x9f, +0x79, +0x4d, +0x7b, +0x32, +0x0a, +0x6b, +0xbe, +0xbd, +0x02, +0x6c, +0xe8, +0xec, +0x48, +0x60, +0x76, +0x98, +0xed, +0x86, +0xb7, +0x1e, +0xf9, +0x68, +0x89, +0x71, +0x09, +0x66, +0x8d, +0xa5, +0x17, +0x77, +0x05, +0xb6, +0xac, +0x6c, +0x80, +0xb3, +0x19, +0x49, +0xf1, +0x6d, +0x28, +0xe3, +0x87, +0x33, +0x67, +0xb1, +0x2f, +0x2a, +0xd8, +0xf1, +0x54, +0x22, +0x88, +0x19, +0x55, +0xc4, +0x08, +0xa6, +0x16, +0x4b, +0x68, +0x9e, +0xd0, +0x88, +0xaa, +0x03, +0x47, +0xd2, +0x3b, +0xf4, +0xb2, +0x38, +0x3d, +0x6a, +0x6d, +0xac, +0x1e, +0x15, +0x09, +0xa3, +0xcf, +0x46, +0x85, +0x12, +0xfe, +0xe2, +0x89, +0xaf, +0xff, +0x49, +0x78, +0x56, +0xbc, +0x8b, +0x56, +0xdd, +0xd4, +0x06, +0xb8, +0x71, +0xa8, +0x9a, +0xb8, +0xd1, +0xdc, +0x7b, +0xdb, +0xbf, +0x31, +0xb0, +0xc8, +0x7e, +0x4f, +0x3b, +0xa4, +0x6c, +0x8d, +0xdf, +0xb7, +0x7c, +0xf3, +0xac, +0xb6, +0x81, +0x6f, +0xf3, +0xff, +0xbe, +0x5a, +0xa5, +0x56, +0x3a, +0x42, +0x8b, +0x75, 
+0xff, +0xd5, +0x32, +0x42, +0x2b, +0x61, +0x37, +0xf5, +0x60, +0xe8, +0xe0, +0x66, +0x33, +0x65, +0x56, +0x46, +0x86, +0x89, +0x38, +0xb8, +0x34, +0xad, +0xfb, +0x5c, +0x8b, +0xc1, +0x64, +0xa3, +0x59, +0xbd, +0xb6, +0x17, +0xb7, +0x55, +0x9c, +0xf2, +0x12, +0xcc, +0xef, +0x38, +0xd3, +0xc8, +0xee, +0xbf, +0x80, +0x6e, +0x5b, +0x57, +0xfc, +0xef, +0x52, +0xac, +0x67, +0xe1, +0xd6, +0x17, +0xca, +0x6b, +0x45, +0x18, +0xde, +0x3f, +0x8a, +0x06, +0x3f, +0x04, +0x3d, +0x6c, +0xe2, +0xe1, +0xc6, +0x7a, +0x06, +0xbd, +0x1a, +0x0d, +0xbf, +0xf2, +0x1b, +0x99, +0x62, +0x62, +0xf3, +0x85, +0x90, +0x5b, +0xd1, +0xf2, +0x14, +0x49, +0x9a, +0x4c, +0x85, +0x15, +0xf1, +0xc0, +0xd5, +0xf0, +0x13, +0x07, +0x77, +0x67, +0x68, +0xf4, +0x4e, +0x04, +0xfe, +0x56, +0x44, +0xd6, +0x8e, +0x7a, +0xce, +0xe8, +0x6d, +0xcb, +0x6b, +0xcf, +0x4a, +0x36, +0x8f, +0x93, +0x31, +0x59, +0xda, +0x61, +0x60, +0xc3, +0xdd, +0xe2, +0x94, +0xb8, +0xd3, +0xa1, +0xdc, +0x7c, +0x1c, +0x24, +0x42, +0xa6, +0x3c, +0x8f, +0x17, +0x01, +0xc8, +0x64, +0xbd, +0xfd, +0x6b, +0x5a, +0xdb, +0x10, +0x08, +0xf7, +0x7c, +0x50, +0xdc, +0x88, +0x2e, +0xf7, +0xf8, +0x02, +0x26, +0x67, +0x0d, +0xe6, +0x9c, +0xf6, +0x3d, +0xb6, +0x9c, +0x48, +0x3f, +0x4c, +0x27, +0x70, +0x14, +0xe7, +0x84, +0x3a, +0x94, +0x72, +0x9a, +0x42, +0x54, +0xb7, +0x6f, +0xb2, +0x14, +0xf1, +0xe6, +0x4c, +0xd7, +0x17, +0xe1, +0x70, +0x5a, +0x50, +0x39, +0x2b, +0xf6, +0x7a, +0xcb, +0x46, +0xd5, +0x53, +0x2f, +0x27, +0xf0, +0x12, +0xd8, +0xba, +0xb6, +0x39, +0x1d, +0x0b, +0xb3, +0xbf, +0xe9, +0x0d, +0xf5, +0x36, +0x8f, +0xfa, +0x99, +0xc3, +0xef, +0x6c, +0x12, +0x14, +0x97, +0xa5, +0x7d, +0x71, +0x95, +0x68, +0x6f, +0x0f, +0x52, +0x10, +0x4d, +0x06, +0x8f, +0xe4, +0x01, +0x75, +0x7e, +0x06, +0xf5, +0xe6, +0xdd, +0xd4, +0x34, +0x2d, +0x74, +0x3c, +0x5e, +0x20, +0xba, +0xcf, +0x33, +0x09, +0x9f, +0xf2, +0x5e, +0x23, +0x17, +0x35, +0x41, +0xd7, +0xe7, +0x3d, +0x3d, +0x8a, +0x24, +0xbb, +0x45, +0x4e, +0xa9, +0x03, +0x2c, +0x09, +0xd7, +0x4f, +0xaa, 
+0x70, +0xb6, +0x1c, +0xe2, +0xcb, +0xf6, +0xef, +0x0e, +0x08, +0x90, +0x0c, +0x23, +0xe8, +0x44, +0xd5, +0x5d, +0x83, +0xb2, +0xed, +0x80, +0xdf, +0xf2, +0xa7, +0xaa, +0xfe, +0xa9, +0x43, +0x0e, +0xe8, +0x48, +0xff, +0x0c, +0xc4, +0xd8, +0x79, +0x7c, +0x94, +0x20, +0x83, +0x04, +0xc4, +0x85, +0xc2, +0x46, +0x1a, +0x99, +0x2d, +0xff, +0x65, +0x56, +0x81, +0x11, +0x54, +0x5a, +0xab, +0xe1, +0xc5, +0xad, +0xb3, +0xfe, +0xc3, +0xfd, +0xd4, +0x28, +0xb0, +0xa8, +0xab, +0x1d, +0x5e, +0x37, +0x7c, +0xdb, +0x11, +0xa8, +0x14, +0x57, +0x13, +0x86, +0x6a, +0x81, +0x7e, +0x9c, +0x31, +0x94, +0x89, +0xb5, +0x8c, +0x35, +0xbd, +0xe1, +0xae, +0xf9, +0x5f, +0x74, +0x88, +0x58, +0x7a, +0x0e, +0x2b, +0x7e, +0x4c, +0xd8, +0x38, +0x10, +0xab, +0xfc, +0x22, +0xae, +0xc9, +0x3f, +0x32, +0x85, +0xff, +0x90, +0xd8, +0xb0, +0x2b, +0x5b, +0x1f, +0x92, +0x18, +0x28, +0x5f, +0xf4, +0x58, +0x37, +0x8c, +0xc5, +0xe6, +0xfd, +0x69, +0x27, +0x95, +0xc6, +0x64, +0x0c, +0x0a, +0x0b, +0xe9, +0xbb, +0xf7, +0xb9, +0x10, +0x68, +0xca, +0x34, +0x36, +0xa3, +0x85, +0xb1, +0xd8, +0xdb, +0x48, +0x32, +0x6f, +0xbf, +0x50, +0x63, +0x83, +0x72, +0xb3, +0x87, +0xbd, +0x6b, +0x39, +0xd2, +0x6c, +0xaf, +0x0a, +0x1f, +0xaf, +0x21, +0x87, +0xc7, +0xcc, +0x32, +0x7a, +0xf3, +0xed, +0xce, +0xcd, +0x9c, +0x12, +0x6c, +0x00, +0x93, +0xde, +0x93, +0x52, +0x86, +0x44, +0x6f, +0xdd, +0xbd, +0x8d, +0x67, +0xf2, +0xe6, +0x3d, +0xce, +0xaf, +0xc8, +0x05, +0xc9, +0x16, +0xb7, +0xe6, +0x29, +0x4d, +0xc8, +0xc3, +0x76, +0xbb, +0x83, +0x90, +0x91, +0xd1, +0x68, +0x9c, +0x9a, +0x46, +0xa6, +0xaa, +0x05, +0x8a, +0x92, +0x88, +0x49, +0x35, +0x5e, +0x19, +0x2e, +0xda, +0xa9, +0xf7, +0xbe, +0x31, +0x99, +0xad, +0xcf, +0x2f, +0xcd, +0x65, +0x61, +0xc9, +0xda, +0xc1, +0x91, +0xfd, +0xac, +0x7f, +0xb9, +0xec, +0xdc, +0x50, +0x74, +0x68, +0x7f, +0x18, +0x74, +0xae, +0xe8, +0x46, +0xdf, +0x18, +0x30, +0xf9, +0xb4, +0x96, +0x17, +0x2d, +0x78, +0x89, +0xef, +0x8b, +0xc7, +0x68, +0x8f, +0x3f, +0xc6, +0xec, +0xd8, +0x33, +0xa5, +0xd7, 
+0xf1, +0xbb, +0x75, +0x9a, +0x53, +0x9d, +0x49, +0x83, +0x49, +0x01, +0x4b, +0x58, +0x93, +0xa8, +0xd9, +0xae, +0x1d, +0xcb, +0xb0, +0x54, +0xa7, +0xfd, +0x3e, +0x60, +0x38, +0x4e, +0x7c, +0xcc, +0x98, +0x12, +0x7c, +0xe9, +0x2b, +0xde, +0x7e, +0x4f, +0x96, +0x87, +0x35, +0xd3, +0xf0, +0x6d, +0x8d, +0x7a, +0x61, +0x50, +0xa5, +0xad, +0x51, +0x9a, +0x6c, +0x04, +0x52, +0x84, +0x2c, +0xfa, +0x90, +0x78, +0x7f, +0xb1, +0x45, +0x8c, +0xb1, +0x09, +0x7e, +0xe9, +0x9f, +0xe6, +0xa0, +0xee, +0x0c, +0x03, +0x13, +0x29, +0xa2, +0x69, +0xc8, +0x74, +0xe9, +0x23, +0x07, +0x3c, +0xd8, +0xa5, +0xff, +0x70, +0xa9, +0x20, +0x83, +0xf0, +0x65, +0x21, +0x89, +0x53, +0x64, +0x0d, +0x25, +0x06, +0xd4, +0xd2, +0x10, +0xc2, +0x86, +0x80, +0xd6, +0x31, +0x69, +0xe9, +0x0a, +0xd3, +0x2f, +0xc2, +0xc9, +0xb6, +0x00, +0xb4, +0xd6, +0x09, +0xe2, +0x4c, +0x92, +0x41, +0x7a, +0x30, +0xef, +0xe3, +0xf2, +0xa8, +0x1e, +0xe7, +0x00, +0x3e, +0x96, +0x39, +0xbc, +0x90, +0x2c, +0x84, +0xa1, +0x1e, +0x79, +0x96, +0xe1, +0x19, +0x7e, +0x69, +0x3a, +0x51, +0x5c, +0x04, +0xf3, +0xfe, +0x44, +0x5c, +0x50, +0x42, +0x28, +0x6e, +0x61, +0xed, +0xc5, +0xa5, +0xe1, +0x04, +0xf9, +0x07, +0x3f, +0xf9, +0xdd, +0xc3, +0x6d, +0x50, +0x16, +0x20, +0x31, +0x0f, +0xa5, +0x89, +0xfd, +0x6d, +0xce, +0x66, +0xfd, +0xd0, +0x6c, +0xe8, +0x6b, +0x1a, +0x6e, +0x6b, +0x81, +0x98, +0x24, +0x83, +0x30, +0x20, +0x33, +0x78, +0x5b, +0xf9, +0x98, +0xe8, +0x06, +0xff, +0x0e, +0x45, +0x65, +0x35, +0x78, +0x5e, +0x7f, +0xe8, +0xce, +0xc8, +0x2c, +0xcf, +0xca, +0x4a, +0xa1, +0x0d, +0x71, +0x3e, +0xa0, +0x15, +0xbf, +0xa9, +0x6a, +0x61, +0x02, +0xb5, +0x88, +0x90, +0xa6, +0xe8, +0xd0, +0x5d, +0xdb, +0x5f, +0x7b, +0xc9, +0xde, +0x4f, +0x0b, +0x6f, +0x84, +0x39, +0x9b, +0xbc, +0x79, +0x30, +0x5c, +0x2e, +0x19, +0x8f, +0x03, +0x9b, +0x2f, +0x68, +0x17, +0xd3, +0x4d, +0x15, +0xfc, +0xea, +0x44, +0xb6, +0x98, +0xd4, +0x05, +0x3f, +0xd7, +0x1b, +0x35, +0x1a, +0x30, +0x4b, +0xbe, +0x1c, +0xcb, +0x94, +0x24, +0xac, +0x45, +0x14, +0xdf, 
+0x90, +0xbf, +0x0e, +0x98, +0xe9, +0x84, +0xab, +0xb5, +0xfc, +0xe6, +0xec, +0x11, +0x22, +0x75, +0xf3, +0x8e, +0x27, +0xd5, +0x6a, +0xaf, +0xe6, +0x15, +0x8f, +0xbe, +0x3a, +0x63, +0x81, +0xde, +0xe1, +0x41, +0xec, +0x14, +0x8e, +0x24, +0xdc, +0x79, +0xf2, +0x9d, +0xaf, +0xdb, +0x09, +0x3d, +0x99, +0xbb, +0x66, +0xf2, +0xa4, +0x24, +0xb9, +0x86, +0xaa, +0xc6, +0x5b, +0x55, +0xba, +0xb9, +0x28, +0xd1, +0x6a, +0x13, +0x48, +0x68, +0xd1, +0x3c, +0xd8, +0xef, +0xb7, +0x68, +0x99, +0xeb, +0xed, +0x63, +0x6d, +0x09, +0xf9, +0xdf, +0xde, +0x30, +0x72, +0x13, +0xcf, +0x02, +0xba, +0x15, +0x8a, +0x1d, +0xfe, +0x40, +0x02, +0x9f, +0x7b, +0xb5, +0x07, +0xcf, +0x9a, +0x09, +0xb2, +0x32, +0x1f, +0xf6, +0x67, +0x2a, +0x4a, +0x57, +0x33, +0x63, +0x22, +0xaa, +0x5f, +0x94, +0xd1, +0x30, +0x68, +0x48, +0xed, +0x2c, +0xde, +0x66, +0x0c, +0x2c, +0x38, +0xdf, +0x9c, +0x40, +0x03, +0xef, +0x07, +0xac, +0x6c, +0x42, +0xeb, +0xa0, +0x21, +0xc4, +0x9a, +0x04, +0x46, +0xee, +0x8c, +0x88, +0xb0, +0xfd, +0xa9, +0x1e, +0x7b, +0x11, +0x2d, +0x97, +0x11, +0x00, +0xe5, +0x3a, +0x41, +0x25, +0x0d, +0xc0, +0xff, +0xe2, +0x67, +0x7b, +0xf3, +0xf6, +0x4d, +0xa1, +0x71, +0xcd, +0xe7, +0x5a, +0xa2, +0x48, +0x0b, +0x0a, +0x48, +0xe3, +0x9a, +0x35, +0x7a, +0x35, +0x11, +0x4c, +0x0e, +0x9b, +0x51, +0xe2, +0xdd, +0x1c, +0x78, +0x40, +0x67, +0x8b, +0x13, +0x8d, +0x6b, +0xec, +0xb3, +0xfa, +0x5c, +0xfe, +0xcd, +0xbc, +0xa0, +0x34, +0x4a, +0x2e, +0xc6, +0x3b, +0x55, +0x80, +0xbd, +0xb9, +0x71, +0xe0, +0x65, +0xe1, +0x11, +0x31, +0xf8, +0x40, +0xe4, +0x8d, +0x49, +0x2e, +0x7c, +0x00, +0x06, +0xbc, +0x4c, +0xaf, +0xd7, +0x23, +0x66, +0xf1, +0x96, +0xeb, +0x55, +0x54, +0x13, +0x65, +0x11, +0x1a, +0x17, +0x21, +0x17, +0xd6, +0xdd, +0x3f, +0xd1, +0x32, +0xda, +0x3a, +0x2e, +0x8f, +0xe3, +0xa8, +0xd4, +0x7e, +0xe8, +0xca, +0xf3, +0x99, +0x69, +0x4f, +0x56, +0xd9, +0xf5, +0x8d, +0x67, +0x52, +0xaa, +0xa5, +0x58, +0x5e, +0x21, +0xf8, +0x28, +0xef, +0x96, +0x6c, +0x4f, +0x49, +0x4d, +0xd7, +0x1a, +0xda, +0xd9, 
+0x2f, +0xf9, +0x85, +0x75, +0x24, +0x4e, +0x1a, +0xa4, +0x98, +0x71, +0x7e, +0x1c, +0xed, +0xb7, +0x31, +0x3e, +0x5d, +0x9c, +0x2a, +0xed, +0x16, +0xaf, +0x04, +0x4b, +0x5a, +0x5e, +0xb0, +0xa0, +0x58, +0xe7, +0x31, +0xec, +0xb0, +0x64, +0xb9, +0x60, +0x6f, +0x83, +0xce, +0xef, +0x8f, +0x39, +0x05, +0x3c, +0xca, +0xc6, +0x6d, +0xb0, +0xc6, +0xf3, +0x79, +0xb9, +0x24, +0x6a, +0xc8, +0xd7, +0xb5, +0x15, +0xa7, +0xa7, +0xb9, +0x2c, +0xed, +0x5c, +0x1c, +0xd4, +0xd4, +0x89, +0xdb, +0xf6, +0x40, +0x73, +0xa2, +0x09, +0xbe, +0x96, +0x65, +0xd0, +0x72, +0xc2, +0x94, +0x17, +0x7b, +0x03, +0x8b, +0xd0, +0x2f, +0x67, +0x19, +0x82, +0x56, +0x39, +0xa8, +0x9d, +0x8c, +0x11, +0xa2, +0x68, +0xd4, +0x84, +0x79, +0xba, +0x76, +0x09, +0x27, +0x2a, +0x12, +0xa4, +0x53, +0x32, +0xda, +0x71, +0x98, +0x36, +0x08, +0x9b, +0x73, +0x55, +0x39, +0xc5, +0xd8, +0xe7, +0x4a, +0xb8, +0x0c, +0x0d, +0x83, +0x11, +0xe9, +0x8d, +0xbc, +0x06, +0x60, +0xff, +0x08, +0xe7, +0x14, +0x73, +0xd5, +0x2d, +0x4a, +0xb6, +0x5a, +0x3d, +0x43, +0xea, +0xd1, +0xde, +0x98, +0xf4, +0xe3, +0xc7, +0x6d, +0xbc, +0x59, +0xae, +0xfc, +0x00, +0xf3, +0x2f, +0x57, +0xa6, +0xb0, +0xb2, +0x74, +0xfc, +0x67, +0x9d, +0x9e, +0x46, +0xa0, +0xaa, +0x05, +0x84, +0x10, +0x47, +0x50, +0xc2, +0xc0, +0xda, +0x07, +0xc7, +0x4d, +0xe1, +0xf1, +0x9b, +0x23, +0xd5, +0xf4, +0x77, +0xd5, +0x64, +0xc2, +0xf0, +0x16, +0x63, +0x24, +0x3b, +0x30, +0x15, +0xfa, +0x68, +0x83, +0x40, +0xf2, +0x61, +0xe2, +0x9c, +0xdf, +0xa7, +0x91, +0xf7, +0x39, +0x3d, +0xe8, +0x38, +0x78, +0x28, +0xf3, +0xb5, +0x6a, +0x78, +0x51, +0x74, +0x53, +0x16, +0xed, +0x81, +0x06, +0x0d, +0xda, +0x51, +0xb0, +0x3f, +0xc5, +0x1a, +0x56, +0xae, +0xd0, +0xc2, +0x14, +0x5c, +0xad, +0xfa, +0xd8, +0x2f, +0xef, +0x5d, +0x47, +0x4f, +0x95, +0x8a, +0x27, +0x80, +0x11, +0xd1, +0x6b, +0x43, +0xab, +0xea, +0x57, +0xad, +0xe4, +0xc6, +0x5f, +0x5a, +0x46, +0x66, +0x9f, +0x67, +0x90, +0xf1, +0x3e, +0x7d, +0x02, +0xcc, +0x1d, +0x4e, +0x8c, +0xb2, +0x78, +0x10, +0x68, +0x7f, +0x23, 
+0x14, +0xea, +0xa0, +0x6c, +0x48, +0xce, +0x23, +0xf2, +0x4c, +0x25, +0x94, +0x80, +0xda, +0x9f, +0xe5, +0x70, +0x4b, +0x1a, +0x94, +0xf3, +0xb3, +0x67, +0xfd, +0x47, +0x0e, +0x87 +}; + + + +/* -----------EXPECTED OUTPUTS----------- */ +uint8_t exp_outp_0 [] = { +0xf7, +0x15, +0x51, +0x51, +0x24, +0x15, +0x55, +0x55, +0x51, +0x5c, +0x50, +0x45, +0x55, +0x11, +0xc0, +0x05, +0x55, +0x15, +0x55, +0x45, +0x55, +0x55, +0x51, +0x45, +0x51, +0xc4, +0x55, +0x55, +0x55, +0x54, +0x55, +0x55, +0x55, +0x14, +0x54, +0x50, +0x51, +0x0c, +0xd5, +0x11, +0x54, +0x55, +0x5d, +0x75, +0x47, +0x50, +0x44, +0x14, +0x10, +0x50, +0x54, +0x35, +0x57, +0x14, +0x44, +0x45, +0x54, +0x55, +0x50, +0x1d, +0x55, +0x11, +0x15, +0x05, +0x54, +0x51, +0xd5, +0x45, +0x51, +0xd5, +0x55, +0x55, +0x54, +0x51, +0x50, +0x45, +0xd4, +0x47, +0x10, +0x4d, +0x55, +0x55, +0x51, +0x51, +0xf4, +0x54, +0xd8, +0x59, +0x05, +0xb3, +0x4f, +0x15, +0x54, +0x50, +0x15, +0xd4, +0xc5, +0x93, +0x45, +0x15, +0x54, +0x55, +0xb5, +0x1c, +0x54, +0x50, +0x55, +0x55, +0x55, +0x41, +0xc5, +0x55, +0x15, +0xc5, +0x10, +0x55, +0x54, +0x51, +0x75, +0x55, +0x55, +0x17, +0x55, +0x55, +0x55, +0x5c, +0x05, +0x55, +0x55, +0x11, +0x55, +0x45, +0xc7, +0x4c, +0x57, +0x55, +0x15, +0x55, +0x54, +0x51, +0x55, +0x51, +0x55, +0x51, +0x41, +0x70, +0x55, +0x15, +0x45, +0x0d, +0x1d, +0x55, +0x4c, +0x51, +0x05, +0x51, +0x55, +0x45, +0xc5, +0x51, +0x51, +0x51, +0x55, +0x15, +0x5d, +0x50, +0x55, +0x55, +0x41, +0x44, +0x15, +0x55, +0x45, +0x54, +0x55, +0x10, +0x55, +0x55, +0x15, +0x55, +0x11, +0x55, +0xf5, +0x55, +0x15, +0x55, +0x55, +0x51, +0x45, +0x05, +0x51, +0x55, +0x54, +0x05, +0x44, +0x71, +0x5c, +0x55, +0x44, +0x55, +0x35, +0x45, +0x1c, +0x50, +0x55, +0x45, +0x55, +0x15, +0x5e, +0x15, +0x11, +0x55, +0x54, +0x55, +0x45, +0x54, +0x54, +0x55, +0x55, +0x55, +0x51, +0x15, +0x05, +0x51, +0x55, +0x51, +0x50, +0x51, +0x45, +0x00, +0x54, +0x75, +0x14, +0x45, +0x5d, +0x44, +0x51, +0x55, +0x15, +0x40, +0xd4, +0x45, +0x45, +0x17, +0x45, +0x55, +0x71, +0x05, 
+0x40, +0x11, +0x13, +0xd4, +0x15, +0x51, +0x55, +0x01, +0x57, +0x50, +0x55, +0x74, +0x15, +0x55, +0x51, +0x47, +0x15, +0x55, +0x17, +0x55, +0x4d, +0x45, +0x41, +0x04, +0x15, +0x14, +0x51, +0x51, +0x5d, +0x55, +0x31, +0x55, +0x47, +0x55, +0x15, +0x41, +0x55, +0x41, +0x04, +0x55, +0x54, +0x55, +0x54, +0x5e, +0x55, +0x15, +0x44, +0x45, +0xd1, +0x15, +0x55, +0x15, +0x75, +0x54, +0x54, +0x34, +0x15, +0x15, +0x54, +0x55, +0x54, +0x05, +0x55, +0x14, +0x55, +0x14, +0x15, +0x11, +0x51, +0x50, +0x45, +0x55, +0x1d, +0x51, +0x15, +0x10, +0x55, +0x55, +0xd4, +0x45, +0x55, +0x55, +0x55, +0x35, +0x55, +0x15, +0x75, +0x54, +0x5c, +0x54, +0x45, +0x74, +0x51, +0x55, +0x55, +0x75, +0x40, +0x41, +0xd0, +0x5d, +0x51, +0x54, +0x55, +0x41, +0x45, +0x57, +0x75, +0x55, +0x50, +0x05, +0x54, +0x05, +0x74, +0x51, +0x51, +0x75, +0x13, +0x34, +0x15, +0x41, +0x54, +0x5d, +0x5c, +0x5d, +0x53, +0x04, +0x44, +0x41, +0x55, +0x55, +0x54, +0x55, +0x54, +0x01, +0x41, +0x91, +0x45, +0x40, +0xc1, +0x44, +0x50, +0x55, +0x51, +0x51, +0x55, +0x3c, +0x50, +0x55, +0x51, +0x55, +0x50, +0xd5, +0x15, +0x55, +0x5d, +0xc5, +0x55, +0x45, +0x55, +0x41, +0x55, +0x41, +0x54, +0x54, +0x57, +0x75, +0xd0, +0x71, +0x01, +0x75, +0x15, +0x54, +0x45, +0x45, +0x54, +0x71, +0x51, +0x55, +0xc5, +0x5d, +0x55, +0x44, +0x15, +0x47, +0x1c, +0x5d, +0x55, +0x53, +0x55, +0x55, +0x15, +0x55, +0x45, +0x55, +0x01, +0x55, +0x15, +0x55, +0x44, +0x51, +0xd1, +0xd5, +0x14, +0x41, +0x15, +0x54, +0x55, +0x55, +0xd4, +0x15, +0x45, +0x01, +0x71, +0x44, +0x54, +0xd5, +0x54, +0x4d, +0x14, +0x51, +0x01, +0x15, +0x4d, +0x55, +0x55, +0x45, +0x44, +0x75, +0x54, +0x0d, +0x45, +0x01, +0x55, +0xd5, +0x15, +0x45, +0x55, +0x55, +0x55, +0x54, +0xd4, +0x54, +0x55, +0x17, +0x55, +0x55, +0x49, +0x65, +0x14, +0x74, +0x40, +0x5d, +0x55, +0x34, +0x54, +0x55, +0x44, +0xd5, +0x55, +0x55, +0x55, +0x40, +0x54, +0x51, +0xc5, +0x01, +0x15, +0x51, +0x55, +0x54, +0x50, +0x48, +0x45, +0x5d, +0x55, +0x04, +0x54, +0xd1, +0x54, +0x57, +0x55, +0x54, +0x05, +0x11, +0x41, 
+0x54, +0x35, +0xd5, +0x55, +0x56, +0x15, +0x45, +0x15, +0x50, +0x74, +0x50, +0x40, +0x54, +0x55, +0x55, +0x55, +0x15, +0x6d, +0x51, +0x54, +0x54, +0x75, +0x55, +0x51, +0x51, +0xf1, +0x54, +0x44, +0x51, +0x10, +0x05, +0x00, +0xc7, +0x55, +0x05, +0x5c, +0x74, +0x11, +0x5d, +0x4d, +0x44, +0x34, +0x44, +0x51, +0xd5, +0x55, +0xcd, +0x51, +0x45, +0x47, +0x55, +0x54, +0x11, +0x55, +0x55, +0x05, +0x45, +0x45, +0x55, +0x54, +0x51, +0x45, +0x51, +0x01, +0x15, +0x50, +0x40, +0x44, +0x54, +0x14, +0x55, +0x41, +0x41, +0x01, +0x55, +0x55, +0x55, +0x15, +0x55, +0x44, +0x55, +0x15, +0x07, +0xc4, +0x15, +0x45, +0x5d, +0x55, +0x15, +0x54, +0x51, +0x55, +0x55, +0x51, +0x51, +0x55, +0x10, +0x55, +0x5d, +0x54, +0x11, +0x55, +0x4c, +0xd5, +0x51, +0x45, +0x50, +0x55, +0x45, +0x41, +0x15, +0x57, +0x75, +0x33, +0x55, +0x54, +0x10, +0x15, +0x50, +0x55, +0xc4, +0x7d, +0x55, +0x45, +0x15, +0x45, +0x15, +0x45, +0x55, +0x15, +0x55, +0x51, +0x45, +0x51, +0x55, +0x51, +0x41, +0x15, +0x44, +0x45, +0x55, +0x47, +0x14, +0x54, +0x53, +0x54, +0x51, +0x05, +0x55, +0x55, +0x55, +0x55, +0x55, +0x51, +0x51, +0x15, +0x51, +0x14, +0x55, +0x55, +0x45, +0x54, +0x50, +0x55, +0x55, +0x45, +0x4d, +0x51, +0x50, +0x55, +0x55, +0x50, +0x55, +0x57, +0x41, +0x54, +0x55, +0x04, +0x1d, +0x41, +0x55, +0xd4, +0xdd, +0x4d, +0x14, +0x11, +0x55, +0x51, +0x74, +0x45, +0x01, +0x44, +0x55, +0x55, +0x40, +0x55, +0x51, +0x51, +0x55, +0x55, +0x55, +0x45, +0x75, +0x53, +0x5c, +0x54, +0x55, +0x51, +0x15, +0x55, +0x01, +0x55, +0x55, +0x54, +0x55, +0x15, +0x50, +0x11, +0x50, +0x14, +0x53, +0x55, +0x55, +0x54, +0x15, +0x5c, +0x45, +0x7d, +0x18, +0x14, +0x55, +0x44, +0x55, +0x1d, +0x55, +0x54, +0x05, +0x54, +0x50, +0x05, +0x47, +0x54, +0xdc, +0x51, +0x51, +0x51, +0x14, +0x54, +0x54, +0x55, +0x45, +0x55, +0x50, +0x55, +0x55, +0x15, +0x44, +0x54, +0x51, +0x15, +0x55, +0x51, +0x11, +0x14, +0x54, +0x55, +0x45, +0x41, +0x15, +0x04, +0x55, +0x70, +0x10, +0x54, +0x55, +0x75, +0x55, +0x55, +0x45, +0x54, +0x45, +0x01, +0x55, +0x15, +0x74, 
+0x35, +0x15, +0x14, +0x51, +0x55, +0xd1, +0x55, +0x55, +0x74, +0x51, +0x11, +0x45, +0x41, +0xd5, +0x15, +0xd0, +0x55, +0x55, +0x05, +0x05, +0x45, +0x4d, +0x1c, +0x55, +0x15, +0x01, +0x75, +0x55, +0x54, +0x15, +0x55, +0x50, +0x51, +0x55, +0x51, +0x55, +0x14, +0xd5, +0xd5, +0x57, +0x15, +0x0c, +0x47, +0x55, +0x05, +0x5c, +0x55, +0x51, +0x55, +0x75, +0x5c, +0x4d, +0x45, +0xd5, +0x50, +0x5d, +0x45, +0x55, +0x54, +0x55, +0x5d, +0x05, +0x51, +0x5d, +0x15, +0x51, +0x54, +0x52, +0x54, +0x55, +0x55, +0x55, +0x14, +0xd4, +0x40, +0x5d, +0x54, +0x45, +0x10, +0x50, +0x55, +0x51, +0x51, +0x11, +0x41, +0x57, +0x55, +0x55, +0xd0, +0x55, +0x13, +0x40, +0x04, +0xd5, +0x45, +0x57, +0x55, +0x51, +0x10, +0x05, +0x55, +0x54, +0x54, +0x51, +0x45, +0xd4, +0x75, +0x55, +0x11, +0x54, +0x45, +0x55, +0x15, +0x55, +0x45, +0x1d, +0x15, +0x55, +0x4d, +0x45, +0x15, +0x11, +0xd4, +0x55, +0x55, +0x55, +0x50, +0x55, +0x45, +0x04, +0x64, +0x50, +0x55, +0x11, +0x14, +0x45, +0x45, +0x50, +0x55, +0x54, +0x45, +0x15, +0x41, +0x45, +0x60, +0x54, +0x15, +0x55, +0x41, +0x55, +0x50, +0x15, +0x55, +0x0d, +0x55, +0x51, +0x45, +0x55, +0x34, +0x15, +0x55, +0x57, +0x50, +0x10, +0x55, +0x50, +0x50, +0x41, +0x05, +0x04, +0x55, +0x55, +0x55, +0x05, +0x01, +0x11, +0x41, +0x05, +0x71, +0x41, +0x54, +0xd4, +0x55, +0x11, +0x55, +0x45, +0x15, +0x55, +0x45, +0x85, +0x05, +0x35, +0x51, +0x45, +0x41, +0x14, +0x40, +0x55, +0x55, +0x15, +0xc5, +0xc5, +0x55, +0x45, +0x55, +0x41, +0x05, +0x05, +0x41, +0x45, +0x50, +0x54, +0x43, +0x11, +0x05, +0x45, +0x54, +0x51, +0x55, +0x11, +0x41, +0xd1, +0x45, +0x44, +0x55, +0x55, +0x55, +0x55, +0x47, +0x05, +0xd1, +0x45, +0x15, +0x51, +0x54, +0x55, +0x55, +0x14, +0x45, +0x55, +0x51, +0x50, +0x44, +0x50, +0x74, +0x17, +0x51, +0x55, +0x57, +0x05, +0x11, +0x11, +0x55, +0x75, +0x51, +0x55, +0x71, +0x05, +0x11, +0x45, +0x57, +0x5c, +0x51, +0x54, +0xd0, +0x54, +0x45, +0x45, +0x54, +0x11, +0x41, +0x5c, +0x50, +0x55, +0xd5, +0x51, +0x17, +0x45, +0x17, +0x05, +0x55, +0x41, +0x51, +0x11, +0x45, 
+0x54, +0x14, +0x55, +0x54, +0x51, +0x55, +0x15, +0x55, +0x5c, +0xd5, +0x71, +0x34, +0x51, +0x15, +0x54, +0x15, +0x55, +0x55, +0xc4, +0x15, +0x05, +0x55, +0xe1, +0x54, +0x04, +0x05, +0x54, +0x54, +0x51, +0x55, +0x0d, +0x95, +0x55, +0x54, +0x5d, +0x55, +0x55, +0x51, +0x35, +0x01, +0x51, +0x50, +0x04, +0x17, +0x55, +0x5d, +0x55, +0x15, +0x50, +0x51, +0x45, +0x41, +0x55, +0x41, +0x54, +0xd5, +0x55, +0x55, +0xc4, +0xc5, +0x55, +0x54, +0x41, +0x40, +0x50, +0x14, +0x55, +0x45, +0x11, +0x54, +0x05, +0xd5, +0x5d, +0xd4, +0x75, +0x0c, +0x11, +0x55, +0x45, +0x14, +0xc7, +0x55, +0x55, +0x75, +0x15, +0x55, +0x45, +0x55, +0x51, +0x41, +0x55, +0x45, +0x54, +0x51, +0x55, +0x50, +0x75, +0x41, +0x58, +0x5d, +0x55, +0x45, +0x40, +0x51, +0x54, +0x55, +0x55, +0x55, +0x51, +0x45, +0x55, +0x47, +0xd5, +0x45, +0x51, +0x15, +0x44, +0x54, +0x54, +0xd1, +0x55, +0x50, +0x55, +0x55, +0x41, +0x55, +0x11, +0x45, +0x55, +0x55, +0x15, +0xcd, +0x51, +0x11, +0x45, +0x54, +0x57, +0x05, +0x04, +0x55, +0x44, +0x54, +0x59, +0x51, +0x51, +0xd5, +0x55, +0x51, +0x15, +0x55, +0x14, +0x44, +0x17, +0x41, +0x15, +0x55, +0x45, +0x55, +0x65, +0x45, +0x54, +0x44, +0x31, +0xd5, +0x10, +0x51, +0x54, +0x54, +0x51, +0x07, +0x55, +0x11, +0x55, +0x45, +0x55, +0x05, +0x55, +0x55, +0x15, +0x14, +0x51, +0x54, +0x54, +0x11, +0x54, +0x45, +0x75, +0x75, +0x05, +0x05, +0x51, +0x04, +0xd5, +0x54, +0x45, +0x57, +0x55, +0x5d, +0x55, +0x15, +0x55, +0x54, +0x14, +0x65, +0x05, +0x45, +0x53, +0x45, +0x54, +0x54, +0x17, +0x45, +0x55, +0x01, +0xd5, +0x14, +0x10, +0x54, +0x5d, +0x15, +0x55, +0x55, +0x51, +0x45, +0x10, +0x55, +0xc5, +0x55, +0x45, +0x51, +0x55, +0x55, +0x35, +0x40, +0x54, +0x15, +0x55, +0x55, +0x17, +0x4c, +0x54, +0x55, +0x57, +0x72, +0x45, +0x44, +0x5f, +0x07, +0x11, +0x55, +0x75, +0x55, +0x41, +0x45, +0xd7, +0x45, +0xdd, +0x40, +0xd5, +0x40, +0x04, +0x57, +0x5c, +0x55, +0x54, +0x55, +0x55, +0x35, +0x14, +0x47, +0x31, +0x45, +0x51, +0x45, +0x05, +0x51, +0x44, +0x1c, +0x14, +0x51, +0x11, +0x10, +0x00, +0x45, +0x14, 
+0x51, +0x5d, +0x55, +0x5d, +0x54, +0x45, +0x54, +0x55, +0x15, +0x44, +0xf9, +0x44, +0x54, +0x55, +0x10, +0x45, +0x55, +0x55, +0x51, +0x15, +0x11, +0x55, +0x55, +0x15, +0x14, +0x1c, +0x51, +0x17, +0x41, +0x44, +0x51, +0x41, +0x51, +0x54, +0x75, +0x74, +0x57, +0x17, +0x51, +0xc5, +0x5d, +0x54, +0x15, +0x55, +0x55, +0x75, +0xd5, +0x55, +0x47, +0x31, +0x05, +0x54, +0x11, +0x55, +0x55, +0x15, +0x55, +0xd1, +0x7d, +0x1d, +0x55, +0x15, +0x5d, +0x45, +0x75, +0x1d, +0x5d, +0x51, +0x15, +0x47, +0x05, +0x55, +0x51, +0x54, +0x55, +0x15, +0x55, +0x11, +0x51, +0x43, +0x55, +0x11, +0x15, +0x19, +0x05, +0x40, +0x55, +0x55, +0x15, +0x05, +0x15, +0x14, +0x57, +0x05, +0x54, +0x34, +0x1d, +0x55, +0xd4, +0x14, +0x54, +0x1d, +0x05, +0x57, +0x05, +0x15, +0xd1, +0x55, +0x55, +0x11, +0xf5, +0x55, +0x78, +0x51, +0x54, +0x55, +0x45, +0x55, +0x57, +0x55, +0x51, +0x55, +0x05, +0x5d, +0x55, +0x54, +0x55, +0x51, +0x35, +0x05, +0x55, +0xd5, +0x51, +0x55, +0x50, +0x15, +0x55, +0x55, +0x45, +0x14, +0x55, +0x15, +0x15, +0x04, +0x55, +0x14, +0x51, +0x14, +0x51, +0x55, +0xc5, +0x54, +0x15, +0x5d, +0x14, +0x41, +0x55, +0x40, +0x55, +0x55, +0x55, +0x11, +0x15, +0x15, +0x55, +0x55, +0x04, +0x04, +0x55, +0x55, +0x55, +0x40, +0x51, +0x55, +0x54, +0x70, +0x15, +0x55, +0x51, +0x44, +0x50, +0x54, +0x11, +0x00, +0x55, +0x15, +0x15, +0x75, +0x55, +0xd5, +0x11, +0x54, +0x55, +0x45, +0x10, +0x50, +0x54, +0x15, +0x55, +0x55, +0x44, +0x55, +0x15, +0x5d, +0x14, +0x55, +0x71, +0x55, +0x54, +0x51, +0x54, +0x1c, +0x45, +0x55, +0x55, +0x1c, +0x50, +0x55, +0x14, +0x55, +0x35, +0x41, +0x45, +0x55, +0x51, +0x50, +0x55, +0x1c, +0xd5, +0xcb, +0x57, +0x10, +0x54, +0x71, +0x75, +0x57, +0x55, +0x5d, +0x53, +0x54, +0x44, +0x51, +0x51, +0x35, +0xc1, +0x51, +0x15, +0x55, +0x14, +0x57, +0x45, +0x55, +0x70, +0x74, +0x40, +0x54, +0x54, +0x11, +0x15, +0x45, +0x05, +0x55, +0x55, +0x44, +0x45, +0x55, +0x15, +0x55, +0x45, +0x15, +0x55, +0x55, +0x15, +0x45, +0x51, +0x71, +0x54, +0x54, +0x40, +0x15, +0x55, +0x54, +0x04, +0x44, +0x45, 
+0x54, +0x51, +0x15, +0x41, +0x14, +0x55, +0x45, +0x45, +0x55, +0x51, +0x11, +0x55, +0xd1, +0x14, +0x55, +0x5d, +0x55, +0x15, +0x94, +0xc5, +0x51, +0x51, +0x55, +0x74, +0x41, +0x45, +0x53, +0x14, +0x45, +0x57, +0xd5, +0x54, +0x34, +0x41, +0x54, +0x75, +0x50, +0x15, +0x14, +0x75, +0x45, +0x55, +0x55, +0x55, +0x05, +0x01, +0x54, +0x43, +0x51, +0x41, +0xd4, +0x14, +0x17, +0x55, +0x54, +0xd5, +0x55, +0x41, +0x55, +0x55, +0x00, +0x5c, +0x55, +0x50, +0x14, +0x05, +0xd5, +0x45, +0x44, +0x15, +0x11, +0x05, +0xd1, +0x55, +0x55, +0xd1, +0x55, +0x55, +0x51, +0x54, +0x55, +0x55, +0x71, +0x14, +0x55, +0x54, +0x17, +0x54, +0x45, +0x55, +0x51, +0x75, +0xd5, +0x51, +0x75, +0x59, +0x50, +0x71, +0x50, +0x40, +0x5d, +0x45, +0x54, +0x15, +0x55, +0x5c, +0x14, +0x51, +0x04, +0x54, +0x55, +0x54, +0x15, +0x04, +0x1d, +0x75, +0x35, +0x15, +0x15, +0x55, +0x50, +0x44, +0x14, +0x45, +0xf3, +0x55, +0x51, +0x44, +0x41, +0x13, +0x55, +0x15, +0x71, +0x15, +0x54, +0x51, +0x50, +0x51, +0x50, +0x55, +0x51, +0x51, +0x55, +0x45, +0x41, +0x55, +0x15, +0x05, +0x45, +0x15, +0xd5, +0xd5, +0x51, +0x51, +0x4c, +0x54, +0x55, +0x54, +0x44, +0x11, +0x05, +0x10, +0x11, +0x45, +0x55, +0x35, +0x5d, +0x51, +0x51, +0x0d, +0x41, +0x45, +0x71, +0x15, +0xdc, +0x55, +0x54, +0x51, +0x50, +0x55, +0x44, +0xd5, +0x14, +0x41, +0x54, +0x4d, +0x15, +0x01, +0x15, +0x54, +0x41, +0x45, +0x55, +0x54, +0x55, +0x15, +0x55, +0x13, +0x55, +0x5c, +0x45, +0x54, +0xd4, +0x11, +0x4c, +0x55, +0x14, +0x51, +0x45, +0x43, +0x55, +0x54, +0x15, +0x51, +0x51, +0x55, +0x55, +0x05, +0x55, +0x04, +0x75, +0x11, +0x51, +0x75, +0xd1, +0x65, +0x14, +0x45, +0x05, +0x11, +0x17, +0x50, +0x55, +0x15, +0x0d, +0x50, +0x55, +0x45, +0x55, +0x15, +0x55, +0x51, +0x50, +0x51, +0x05, +0xc1, +0x15, +0x54, +0x15, +0x15, +0x55, +0x05, +0x11, +0x54, +0x45, +0x57, +0x14, +0x55, +0x54, +0x54, +0x45, +0x11, +0x70, +0x55, +0x55, +0x44, +0x54, +0x45, +0x51, +0xdd, +0x44, +0x41, +0x15, +0x15, +0x14, +0x54, +0x51, +0x55, +0x15, +0x15, +0x11, +0x45, +0x4c, +0x45, +0x45, 
+0x4d, +0x01, +0x45, +0x55, +0x57, +0x55, +0x51, +0x55, +0x15, +0x50, +0x47, +0x55, +0x55, +0x51, +0x51, +0x54, +0x40, +0x44, +0x55, +0x50, +0x55, +0xc0, +0xd1, +0x05, +0x45, +0x51, +0x45, +0x01, +0x51, +0x15, +0x55, +0x44, +0x44, +0x15, +0x15, +0x55, +0x55, +0x11, +0x54, +0x31, +0x55, +0x54, +0x55, +0x50, +0x15, +0x71, +0xd5, +0x46, +0x41, +0x51, +0x51, +0x55, +0x54, +0x55, +0x15, +0x55, +0x64, +0x50, +0x55, +0x44, +0x41, +0x55, +0x45, +0x54, +0x51, +0x15, +0x05, +0x5d, +0x51, +0x46, +0x55, +0x15, +0x47, +0x15, +0x40, +0x50, +0x54, +0x41, +0x70, +0x50, +0x54, +0x05, +0xd4, +0xc4, +0x11, +0x55, +0x35, +0x44, +0x47, +0x55 +}; + + + +L1_DATA uint32_t threshs_l1 [32] = {0}; +L1_DATA int32_t kappa_l1 [32] = {0}; +L1_DATA int32_t lambda_l1 [32] = {0}; +L1_DATA uint8_t inp_l1 [8192] = {0}; +L1_DATA uint8_t outp_l1 [2048] = {0}; +L1_DATA uint8_t im2col_l1 [IM2COL_DIM] = {0}; + +#endif diff --git a/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_kernels.h b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_kernels.h new file mode 100644 index 0000000..f4ec977 --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_kernels.h @@ -0,0 +1,26528 @@ +/* + * pulp_nn_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __XPULPNN_KERNELS__ +#define __XPULPNN_KERNELS__ + +void xpulp_nn_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, 
+ uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + 
uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2_4x4( 
+ uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2_4x4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, 
+ uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + 
uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2_4x4( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, 
+ int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, 
+ uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif + + +void xpulp_nn_conv_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, 
+ uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t 
ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, 
+ int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, 
+ uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4( + int8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4( + uint8_t *pIn, 
+ int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); /* default build: 13 parameters; NOTE(review): sibling prototypes spell this flag_batchnorm */ +#else + uint8_t flag_batch_norm, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles); /* PROFILE build: four extra uint32_t pointers appended; presumably cycle-count outputs - TODO confirm against the definition */ +#endif /* PROFILE */ + +uint8_t *xpulp_nn_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + 
uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i8( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void 
xpulp_nn_linear_u4_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i2( + 
int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_maxpool_u8( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i8( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u4( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i4( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u2( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + 
uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t 
padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + 
uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t 
out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t 
out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif diff --git a/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_mix_kernels.h b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_mix_kernels.h new file mode 100644 index 0000000..8b2a3c6 --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_mix_kernels.h @@ -0,0 +1,7093 @@ +/* + * pulp_nn_mix_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_MIXED_KERNELS__ /* include guard; '-' is not valid inside a macro name */ +#define __XPULPNN_MIXED_KERNELS__ + +void xpulp_nn_mix_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4_4x4( + uint8_t *pIn, 
+ uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, 
+ uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, 
+ uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8( + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, 
+ uint8_t flag_batchnorm); +/* Each matmul kernel is declared in two forms: the plain form takes two output pointers (pOut, pOut2) and the _4x4 form adds pOut3 and pOut4; the remaining parameters are identical. */ +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_mix_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* Each matmul kernel is declared in two forms: the plain form takes two output pointers (pOut, pOut2) and the _4x4 form adds pOut3 and pOut4; the remaining parameters are identical. */ +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* Each matmul kernel is declared in two forms: the plain form takes two output pointers (pOut, pOut2) and the _4x4 form adds pOut3 and pOut4; the remaining parameters are identical. */ +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* Each matmul kernel is declared in two forms: the plain form takes two output pointers (pOut, pOut2) and the _4x4 form adds pOut3 and pOut4; the remaining parameters are identical. */ +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t 
lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * 
pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, 
+ uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); +/* Avgpool prototypes share one parameter list; only the pointer types differ: pIn follows the first name field and pOut the second (u -> uint8_t *, i -> int8_t *). */ +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); +/* Add prototypes come in two shapes: the two-field names (here add_u8_u8) take uint16_t out_mult1, out_mult2 and out_shift, while the three-field names take a mult/add/shift triple for each input and for the output plus out_requant_flag. */ +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, 
+ uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); +/* NOTE(review): these add prototypes declare pIn1, pIn2 and pOut as uint8_t * even where the name carries an i8/i4/i2 field, while the xpulp_nn_avgpool_* prototypes in this header use int8_t * for signed fields; confirm against the kernel definitions. */ +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); +/* NOTE(review): these add prototypes declare pIn1, pIn2 and pOut as uint8_t * even where the name carries an i8/i4/i2 field, while the xpulp_nn_avgpool_* prototypes in this header use int8_t * for signed fields; confirm against the kernel definitions. */ +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); +/* NOTE(review): these add prototypes declare pIn1, pIn2 and pOut as uint8_t * even where the name carries an i8/i4/i2 field, while the xpulp_nn_avgpool_* prototypes in this header use int8_t * for signed fields; confirm against the kernel definitions. */ +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); +/* NOTE(review): these add prototypes declare pIn1, pIn2 and pOut as uint8_t * even where the name carries an i8/i4/i2 field, while the xpulp_nn_avgpool_* prototypes in this header use int8_t * for signed fields; confirm against the kernel definitions. */ +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_utils.h b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_utils.h new file mode 100644 index 0000000..44d2251 --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_utils.h @@ -0,0 +1,2079 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include + +typedef signed short v2s __attribute__((vector_size (4))); + + + +#define min(a,b) ((a)<(b)?(a):(b)) +#define log2(x) __builtin_pulp_fl1(x) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define CHANS_DECOMPR(x) (5*x >> 2) // equivalent to division by 0.8 + +/* Functions for Compressed MAC */ +#define CompressedMAC(sum, ptr, config) asm volatile( \ + "pv.smlsdotsp.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define CompressedMACUnsigned(sum, ptr, config) asm volatile( \ + "pv.smlsdotsup.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define InitNNRF(ptr, config) asm volatile( \ + "pv.smlsdotsp.t x0, %[phtr], %[chonfig];" \ + : [phtr] "+r" (ptr) : [chonfig] "I" (config)) + +#define ThresholdCompress(res, val, thrs) asm volatile( \ + "pv.thrc %[rhes], %[vhal], %[thhrs];" : [rhes] "+r" (res) : [vhal] "r" (val), [thhrs] "r" (thrs)) + +#define GetConfig(a_update, b_update, a_reg, b_reg) a_update << 4 | b_update << 3 | a_reg << 1 | b_reg + +/* Functions for threshold&compress */ +#define check_store(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; \ + incr_val=ch_out_r; } + +#define check_store_4x1(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; } + +#define reset_currThr() \ + if ((uint32_t *) currThr == (uint32_t *) (pThr + ch_out)) { 
\ + currThr = (v2s *) pThr; \ + } + +#define MacLoads20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define MacLoad20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +/* Functions for compressed min/max */ +#define CompressedMax(res, in1, in2) asm volatile( \ + "pv.max.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define CompressedMin(res, in1, in2) asm volatile( \ + "pv.min.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Max16(res, in1, in2) asm volatile( \ + "pv.max.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Min16(res, in1, in2) asm volatile( \ + "pv.min.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define thr_cmp(state, val, threshs) __builtin_pulp_thresh_compr(state, val, threshs) + +typedef unsigned char v4u __attribute__((vector_size (4))); +typedef signed char v4s __attribute__((vector_size (4))); + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#ifdef __clang__ +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_binsert(dst,not_mask_imm,src,mask_imm,off) +#else +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#endif +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define maxs20(a, b) __builtin_pulp_max20(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) 
__builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define mins20(a, b) __builtin_pulp_min20(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define 
MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, 
+ int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + 
int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= 
pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) 
pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) 
bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t 
*__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) 
bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 
20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) 
bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = 
*((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) 
bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], 
com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void 
__attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) 
bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = 
(v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_tnn_zero_mem_ternary(uint8_t * pBuffer, unsigned int size, unsigned int uns) +{ + uint8_t pad_val = 0xd9; + uint32_t pad_vec = 0xd9d9d9d9; + if (uns) { + // if we are using an unsigned kernel, we need to pad with -1 because the hardware will add a +1 to ALL values! 
+ pad_val = 0xff; + pad_vec = 0xffffffff; + } + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u)pad_vec; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=pad_val; + lfover-=4; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + 
uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} 
+static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_tnn_compare_and_replace_if_larger_ternary(int8_t * base, + int8_t * target, + uint16_t length) +{ + uint8_t mask2 = 0x0c; + uint8_t n_mask2 = ~ mask2; + uint8_t mask4 = 0x30; + uint8_t n_mask4 = ~ mask4; + uint8_t mask6 = 0xc0; + uint8_t n_mask6 = ~ mask6; + uint8_t off2 = 2; + uint8_t off4 = 4; + uint8_t off6 = 6; + + uint8_t *pIn = (uint8_t *) base; + uint8_t *pCom = (uint8_t *) target; + uint8_t *out; + + int cnt = length >> 2; + uint32_t result; + + while(cnt > 0u) + { + uint32_t in1 
= *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + *((uint32_t *)pIn) = result; + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + if (left>0u) + { + // do the vector max on the whole word - we won't use the leftover bytes + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + + // ...and copy back the relevant bytes of the result to pIn + for (int i=0; i> (8*i)); + + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, 
off6); + + pIn++; + pCom++; + left--; + } +} + +#endif diff --git a/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_utils_xpnn.h b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_utils_xpnn.h new file mode 100644 index 0000000..0c783ae --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/pulp_nn_utils_xpnn.h @@ -0,0 +1,1937 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" +#ifdef GAP_SDK +#include "pulp.h" +#endif + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) 
+#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + 
int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = 
(integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + 
int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = 
(uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) 
out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + 
+ pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) 
bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 
2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); 
+ bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 
4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length 
>> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + 
*((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) 
inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) 
bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) 
bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + 
while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, 
(int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = 
bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +#endif diff --git a/rt_nn_tests/xpnn_maxpool_2b/test.c b/rt_nn_tests/xpnn_maxpool_2b/test.c new file mode 100644 index 0000000..d946a63 --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/test.c @@ -0,0 +1,160 @@ +#include +#include +#include + + +#include "pmsis.h" + +#include "data_statstest.h" +//#include "pulp_nn_kernels.h" +#include "pulp_nn_mix_kernels.h" + + +#define start_cycle_counter() asm volatile("csrw 0xCC0, 0x01;") +#define stop_cycle_counter() asm volatile("csrw 0xCC0, 0x00;") +#define read_cycle_counter(x) asm volatile("csrr %0, 0x780;" : "=r" (x)) +#define reset_cycle_counter() asm volatile("csrw 0x780, 0x0;") + +uint8_t im2col[IM2COL_DIM] = {0}; +uint8_t outputs[OUTPUT_DIM] = {0}; + +int32_t outputs_fp[OUTPUT_DIM_FP] = {0}; + +#ifndef PROFILE +int num_cycles; +#else +int im2col_cycles; +int hotloop_prep_cycles; +int hotloop_cycles; +int threshold_cycles; +int requant_cycles; +int hotloop_leftover_cycles; +int matmul4x2_leftover_cycles; +#endif + +void call_krnl_0(); +void test_0(); + +int main(int argc, char *argv[]) +{ +#if KRAKEN_PTEST == 1 + kraken_padframe_aon_pad_gpioa_cfg_rxe_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_trie_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); +#endif + int n_mismatches = 0; + int mismatches_tot = 0; + + + + #ifndef PROFILE + num_cycles = 0; + #endif + + if (get_core_id() == 0) { + printf("===> TEST 0: Running xpulp_nn_maxpool_i2...\n"); + printf(" dims_in = [32, 32]\n"); + printf(" dims_kernel = [2, 2]\n"); + printf(" ch_in/out = [32, 32]\n"); + //printf(" padding_y_top = [%d]\n", padding_y_top); + //printf(" padding_y_bottom = [%d]\n", padding_y_bottom); + //printf(" padding_x_left = [%d]\n", padding_x_left); + //printf(" padding_x_right = [%d]\n", padding_x_right); + //printf(" stride_x = [%d]\n", stride_x); + //printf(" stride_y = 
[%d]\n", stride_y); + } + test_0(); + #ifndef PROFILE + stop_cycle_counter(); + read_cycle_counter(num_cycles); + if (get_core_id() == 0) { + printf("===> TEST 0: Finished running xpulp_nn_maxpool_i2\n"); + printf("num_cycles = %d\n", num_cycles); + printf("MACs = 1048576\n"); + printf("MACs/cycle = %.4f\n", 1048576/num_cycles); + } + #endif + if (get_core_id() == 0) { + printf("Checking for mismatches..\n"); + n_mismatches = 0; + + for(int i=0; i < 2048; i++) { + if (outputs[i] != exp_outp_0[i]){ + printf("***Mismatch in test 0 at iteration %d: Expected: %x, got: %x\n", i, exp_outp_0[i], outputs[i]); + n_mismatches++; + } + } + } + mismatches_tot += n_mismatches; + + if (get_core_id() == 0) { + printf("Got %d mismatches in %d tests\n", mismatches_tot, 1); + } + return mismatches_tot; +} + + +void call_krnl_0(void) { + uint8_t * pInp; + uint8_t * pIm2ColBuffer; + int8_t * pBias = NULL; + uint8_t * pOut; + int8_t * pWeight; + uint32_t * pThr; + int32_t * pKappa, pLambda; + pInp = inp_l1; + + pOut = outp_l1; +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1); + } +#endif + xpulp_nn_maxpool_i2( + pInp, + pOut, + 32, + 32, + 32, + 16, + 16, + 2, + 2, + 0, + 0, + 0, + 0, + 2, + 2); + +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); + } +#endif +} + +void test_0(void) { + // DMA transfer inputs from L2 to L1 + if (pi_core_id() == 0) { + plp_dma_memcpy(pIn_0, inp_l1, 8192, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + call_krnl_0(); + // get outputs back with DMA + if (pi_core_id() == 0) { + plp_dma_memcpy(outputs, outp_l1, 2048, 0); + plp_dma_barrier(); + } +} + + + + + + + + + + diff --git a/rt_nn_tests/xpnn_maxpool_2b/xpulp_nn_maxpool_i2.c b/rt_nn_tests/xpnn_maxpool_2b/xpulp_nn_maxpool_i2.c new file mode 100644 index 0000000..293bb62 --- /dev/null +++ b/rt_nn_tests/xpnn_maxpool_2b/xpulp_nn_maxpool_i2.c @@ -0,0 +1,146 @@ +/* + * 
xpulp_nn_maxpool_i2.c + * Nazareno Bruschi + * Angelo Garofalo + * + * Copyright (C) 2018-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" +#include "pulp_nn_utils.h" + + + +void __attribute__ ((noinline)) xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y) +{ + int core_id = pi_core_id(); + int n_cores = NUM_CORES; + if (dim_im_in_y < NUM_CORES) + { + n_cores = dim_im_in_y; + } + int Log2Core = log2(n_cores); + int ch_im_in_r = ch_im_in >> 2; + + int chunck = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (NUM_CORES-1))!=0); + + int start = min(chunck * core_id, dim_im_in_y); + int stop = min(start + chunck, dim_im_in_y); + int i_x, i_y; + + for (i_y = start; i_y < stop; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + /* for each output pixel */ + int8_t *target = pIn + (i_y * dim_im_in_x + i_x) * ch_im_in_r; + uint8_t *win_start; + uint8_t *win_stop; + if (i_x * stride_x - padding_l < 0) + { + win_start = target; + } + else + { + win_start = pIn + (i_y * dim_im_in_x + i_x * stride_x - padding_l) * ch_im_in_r; + } + + if (i_x * stride_x - padding_l + dim_kernel_x >= dim_im_in_x) + { + win_stop = pIn 
+ (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in_r; + } + else + { + win_stop = pIn + (i_y * dim_im_in_x + i_x * stride_x - padding_l + dim_kernel_x) * ch_im_in_r; + } + + /* first step is to copy over initial data */ + for (int i = 0; i< ch_im_in_r; i++) target[i] = win_start[i]; + + /* start the max operation from the second part */ + win_start += ch_im_in_r; + for (; win_start < win_stop; win_start += ch_im_in_r) + { + xpulp_nn_compare_and_replace_if_larger_i2(target, win_start, ch_im_in_r); + } + } + } + + pi_cl_team_barrier(); + if (dim_im_out_y < NUM_CORES) + { + n_cores = dim_im_out_y; + } + Log2Core = log2(n_cores); + int chunck2 = (dim_im_out_y >> Log2Core) + ((dim_im_out_y & (NUM_CORES-1))!=0); + int start2 = chunck2 * core_id; + int stop2 = min(start2 + chunck2, dim_im_out_y); + + /* then does the pooling along y axis */ + for (i_y = start2; i_y < stop2; i_y++) + { + /* for each output row */ + int8_t *target = pOut + i_y * dim_im_out_x * ch_im_in_r; + int8_t *row_start; + int8_t *row_end; + /* setting the starting row */ + if (i_y * stride_y - padding_t < 0) + { + row_start = pIn; + } + else + { + row_start = pIn + (i_y * stride_y - padding_t) * dim_im_in_x * ch_im_in_r; + } + /* setting the stopping row */ + if (i_y * stride_y - padding_t + dim_kernel_y >= dim_im_in_y) + { + row_end = pIn + dim_im_in_y * dim_im_in_x * ch_im_in_r; + } + else + { + row_end = pIn + (i_y * stride_y - padding_t + dim_kernel_y) * dim_im_in_x * ch_im_in_r; + } + + /* copy over the first row */ + for (int i = 0; i< dim_im_out_x * ch_im_in_r; i++) + { + target[i] = (int8_t) row_start[i]; + } + /* move over to next row */ + row_start += ch_im_in_r * dim_im_in_x; + + for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in_r) + { + xpulp_nn_compare_and_replace_if_larger_i2(target, row_start, dim_im_out_x * ch_im_in_r); + } + } + pi_cl_team_barrier(); +} diff --git a/rt_nn_tests/xpnnv2_conv/Makefile b/rt_nn_tests/xpnnv2_conv/Makefile new file mode 100644 index 
0000000..73b2b32 --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/Makefile @@ -0,0 +1,19 @@ +APP = test +PULP_APP = test +PULP_APP_SRCS = test.c +PULP_APP_SRCS += xpulp_nn_mix_conv_u4_u8_i2.c +PULP_APP_SRCS += xpulp_nn_mix_matmul_u4_u8_i2.c + +CORE=8 + + +PULP_CFLAGS += -DNUM_CORES=$(CORE) -I. -O3 +PULP_LDFLAGS += -lc -lm -lgcc -Wl,-print-memory-usage + +PULP_APP_CFLAGS += -DNUM_CORES=$(CORE) -I. -O3 +PULP_APP_LDFLAGS += -lc -lm -lgcc -Wl,-print-memory-usage + +PULP_CFLAGS += -DPULP_RUNTIME -DSINGLE_CORE_DMA -DALWAYS_BLOCK_DMA_TRANSFERS + + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/rt_nn_tests/xpnnv2_conv/data_statstest.h b/rt_nn_tests/xpnnv2_conv/data_statstest.h new file mode 100644 index 0000000..d679d0f --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/data_statstest.h @@ -0,0 +1,1894 @@ +// this file is generated automatically by the python script `generate_statstest.py` + +#ifndef _DATA_H +#define _DATA_H + + + +#define IM2COL_DIM (72 * NUM_CORES * 2) // for NUM_CORES cores +#define OUTPUT_DIM 512 +#define OUTPUT_DIM_FP (1) +uint8_t pWeight_0 [] = { +0x29, +0xc5, +0xac, +0x95, +0x91, +0x84, +0x79, +0x41, +0xf7, +0xd6, +0x8a, +0x4e, +0xa8, +0x85, +0x7f, +0x76, +0x57, +0x20, +0x83, +0xa5, +0xfa, +0x8a, +0xd4, +0xf2, +0x58, +0xa0, +0x7d, +0x60, +0x0a, +0x58, +0x02, +0x04, +0x86, +0xfd, +0xcc, +0xfb, +0x8c, +0xc0, +0xc0, +0x88, +0xde, +0x5d, +0x3b, +0x3c, +0x69, +0x80, +0xd0, +0x38, +0x83, +0xe5, +0x35, +0x27, +0x14, +0x5d, +0x2a, +0xe8, +0xd0, +0xf9, +0xcc, +0x81, +0x16, +0x31, +0xd3, +0xc6, +0xf1, +0xfa, +0xe2, +0x5b, +0x08, +0x6d, +0xa5, +0x01, +0xb2, +0x28, +0x54, +0x06, +0x78, +0x97, +0x40, +0x73, +0xd2, +0x27, +0x96, +0xbd, +0x1f, +0xc0, +0xfa, +0xb8, +0x49, +0x68, +0x75, +0x2f, +0xe5, +0xa4, +0xcb, +0xcc, +0x09, +0x39, +0x85, +0x71, +0xc3, +0xfb, +0x6f, +0x9b, +0x42, +0x0f, +0x91, +0x1b, +0x66, +0xaf, +0xfb, +0xc8, +0xf8, +0xba, +0x9d, +0x0a, +0xd2, +0xa8, +0x39, +0x4a, +0x2d, +0xcd, +0xf8, +0xc4, +0x8c, +0x02, +0xe2, +0x4f, +0x68, +0xff, +0x96, 
+0x7f, +0x96, +0xad, +0xa5, +0xf2, +0xb8, +0xb8, +0x5d, +0x02, +0x4c, +0x19, +0xb0, +0x11, +0x51, +0x8d, +0xb3, +0xb1, +0x3f, +0xcd, +0xaf, +0x85, +0x94, +0x65, +0x24, +0xa1, +0xbb, +0x3e, +0xa1, +0xe2, +0xab, +0xb4, +0x62, +0x6c, +0xf5, +0x2f, +0x5a, +0xbe, +0x68, +0xba, +0x8a, +0xcf, +0xa1, +0x3f, +0xfa, +0xb8, +0xe9, +0xe8, +0xfa, +0x4e, +0xdd, +0x57, +0x45, +0x1a, +0x5c, +0xfd, +0x58, +0x77, +0x52, +0xd6, +0x15, +0x07, +0x39, +0xf2, +0x34, +0xf2, +0x66, +0x90, +0xea, +0x42, +0x42, +0x43, +0x36, +0xd7, +0xc3, +0xb8, +0x2c, +0x10, +0xd0, +0x90, +0x2d, +0xf9, +0x6a, +0xbc, +0x57, +0x19, +0x69, +0x60, +0x65, +0x7f, +0xee, +0x10, +0x06, +0x71, +0xe4, +0x49, +0x5b, +0x5a, +0x5a, +0x9c, +0x6f, +0x87, +0x68, +0x44, +0xb4, +0xf8, +0xe8, +0xae, +0x75, +0xe0, +0x0f, +0x43, +0x51, +0x71, +0xde, +0xce, +0x56, +0x13, +0x3d, +0xf9, +0x80, +0x1f, +0x0a, +0x9e, +0x2b, +0xa7, +0x73, +0x76, +0xe2, +0x36, +0x16, +0x42, +0x0c, +0x73, +0x59, +0xf6, +0x53, +0xaf, +0x77, +0x12, +0x67, +0x33, +0xb7, +0x90, +0x41, +0x55, +0x90, +0xbf, +0x8c, +0x31, +0xfc, +0xdd, +0x9c, +0x42, +0x9a, +0xfd, +0x30, +0xe5, +0x12, +0x12, +0x6c, +0xc3, +0xa6, +0xd8, +0xb0, +0x41, +0x4c, +0xd0, +0x27, +0x97, +0x7f, +0x8b, +0x4c, +0x44, +0xb0, +0x7f, +0xba, +0x18, +0x37, +0x46, +0x9e, +0xdc, +0xbf, +0xd8, +0x97, +0xce, +0x3b, +0x61, +0x7e, +0x9b, +0x36, +0xe7, +0x0c, +0x4e, +0x57, +0x21, +0x36, +0x18, +0x4d, +0x17, +0x41, +0x83, +0xd4, +0x8b, +0xf7, +0x53, +0x4d, +0xff, +0x98, +0xbb, +0x79, +0x38, +0x2e, +0x22, +0x8f, +0xe5, +0x5e, +0x81, +0xdb, +0xf0, +0x2b, +0x16, +0xf6, +0x69, +0xb1, +0xe5, +0xd6, +0xab, +0xc6, +0xb1, +0xd0, +0x62, +0x57, +0xa4, +0x6c, +0xe8, +0x0e, +0x65, +0xb6, +0x18, +0x7a, +0xe7, +0x99, +0x92, +0x68, +0x83, +0x7e, +0xe3, +0xca, +0x61, +0xb0, +0x5e, +0x4d, +0x25, +0x92, +0xe5, +0x02, +0xbb, +0x81, +0xfa, +0x81, +0x92, +0x9d, +0x80, +0xd7, +0xf4, +0x73, +0xe3, +0x63, +0x64, +0x5a, +0x26, +0x98, +0x55, +0x32, +0x65, +0xf9, +0x5c, +0x5d, +0xdc, +0x2e, +0x59, +0xf1, +0x90, +0xf5, +0x78, 
+0x05, +0xb1, +0x5c, +0xde, +0xdd, +0x93, +0x3f, +0xd8, +0xc2, +0x2a, +0x94, +0xfd, +0x51, +0x72, +0x9c, +0xda, +0x58, +0xc6, +0x94, +0xe5, +0xed, +0x3d, +0xf3, +0x7a, +0xd8, +0x91, +0xbf, +0xeb, +0x31, +0xda, +0xe4, +0x89, +0x74, +0x31, +0x86, +0x42, +0xe2, +0x1b, +0x7e, +0x28, +0xaf, +0xf8, +0xfd, +0x0b, +0x9a, +0xa7, +0x39, +0xd8, +0x9d, +0x25, +0x7f, +0xe5, +0xc4, +0x7f, +0x98, +0x9f, +0x79, +0x5f, +0x4b, +0x7a, +0x05, +0x07, +0x6c, +0xe9, +0x1f, +0xd0, +0x60, +0xfb, +0x7f, +0xf6, +0x5c, +0xa2, +0x3d, +0x2c, +0x22, +0x7b, +0xbe, +0x39, +0x0a, +0xfc, +0x8f, +0x8b, +0x80, +0x6f, +0x6f, +0x1b, +0x28, +0x9b, +0x79, +0x0e, +0xb3, +0x57, +0x07, +0x45, +0x61, +0x83, +0x12, +0x7a, +0x53, +0x3d, +0x80, +0xf1, +0xe1, +0xd1, +0x40, +0x26, +0xfd, +0xd8, +0x81, +0xab, +0xbe, +0x16, +0xfa, +0xf2, +0x25, +0x15, +0x8f, +0xcd, +0xc7, +0xbe, +0xe1, +0xd9, +0x95, +0x3b, +0xbd, +0x6b, +0x81, +0x59, +0x26, +0x27, +0xe8, +0x1b, +0x74, +0x7b, +0x95, +0xb4, +0xe0, +0x60, +0x0b, +0x44, +0x67, +0x7e, +0xb9, +0xfb, +0xc2, +0xc3, +0xb2, +0xd5, +0x22, +0x39, +0xae, +0xb2, +0xd9, +0x2e, +0x63, +0x19, +0x04, +0xe5, +0x69, +0xf5, +0x5f, +0x2a, +0x9f, +0x8f, +0xe2, +0xab, +0x89, +0x8d, +0xd9, +0x23, +0x45, +0x75, +0xa2, +0x2e, +0x1d, +0xc2, +0xe4, +0xf0, +0xe0, +0xbb, +0x26, +0x8b, +0x7c, +0xd4, +0xaa, +0x6d, +0x74, +0x0e, +0xa7, +0x73, +0x44, +0xc2, +0xee, +0x74, +0xd4, +0x07, +0xf9, +0xdd, +0xd6, +0x57, +0x84, +0xdd, +0xee, +0x67, +0x82, +0xd1, +0x52, +0xb3, +0x35, +0xb3, +0xb4, +0x3e, +0x19, +0xd7, +0x55, +0x22, +0xf0, +0xdd, +0xe1, +0x80, +0x15, +0x71, +0xff, +0x58, +0x87, +0xe2, +0xf2, +0xd4, +0x05, +0x68, +0x46, +0xf2, +0x22, +0xd0, +0x46, +0xa8, +0xa4, +0x16, +0x81, +0xc5, +0x7d, +0x1b, +0x50, +0x25, +0x91, +0xcd, +0x29, +0xa6, +0x35, +0x32, +0xf5, +0xff, +0x53, +0xd7, +0x00, +0xf6, +0xab, +0x39, +0xac, +0x85, +0xe9, +0x1a, +0x80, +0x1f, +0x89, +0x30, +0x77, +0x9a, +0x5e, +0xb2, +0xdb, +0x74, +0x53, +0x48, +0x6a, +0x91, +0x11, +0xf0, +0xd8, +0x94, +0x63, +0xbc, +0x32, +0x84, +0x55, 
+0x04, +0x84, +0x8a, +0x29, +0x38, +0x9c, +0x2c, +0x14, +0x23, +0xfe, +0xb9, +0x88, +0x6e, +0xac, +0xf4, +0xcf, +0xd6, +0x4a, +0xe5, +0x19, +0xad, +0x80, +0x4b, +0x34, +0x0f, +0xbc, +0x93, +0x86, +0x1c, +0x77, +0x1c, +0x7e, +0xda, +0x39, +0xf3, +0xb0, +0x7b, +0xae, +0x0a, +0x4a, +0x49, +0xba, +0x69, +0xf5, +0xa5, +0x05, +0x3b, +0xab, +0xfd, +0xa1, +0xc6, +0xc7, +0x8d, +0x7e, +0x1a, +0x1d, +0xa4, +0x6d, +0x54, +0xe2, +0x1b, +0xb9, +0x46, +0x42, +0xbe, +0xc9, +0x6a, +0xc7, +0x14, +0xa1, +0xd4, +0x27, +0x39, +0x65, +0xf8, +0xb3, +0xf4, +0x1a, +0x31, +0x9a, +0xc5, +0x74, +0x63, +0xb3, +0xa9, +0x10, +0xe2, +0x22, +0x1d, +0xc6, +0x20, +0x00, +0x58, +0xa7, +0xaa, +0x82, +0x25, +0x71, +0xf6, +0xb0, +0x5d, +0xd8, +0xeb, +0xf5, +0x17, +0xd0, +0x04, +0x07, +0xac, +0xf5, +0x63, +0x7a, +0x26, +0x0a, +0x68, +0xe3, +0x74, +0xc5, +0x44, +0x44, +0xd8, +0xf3, +0xa5, +0x99, +0x77, +0xc4, +0x48, +0x47, +0xc0, +0x94, +0xd6, +0x27, +0x79, +0x3d, +0x0a, +0x83, +0x46, +0x0b, +0x8f, +0xd7, +0x03, +0x7c, +0xa7, +0x35, +0xd5, +0x70, +0xb2, +0x19, +0x29, +0x14, +0xb6, +0xbe, +0x61, +0xfb, +0x5c, +0x04, +0xf4, +0x7f, +0xb4, +0xbd, +0xcb, +0x4c, +0xae, +0xd0, +0x20, +0x3e, +0xd9, +0x25, +0xcc, +0xa9, +0xd2, +0x13, +0xf7, +0x66, +0x4e, +0xbe, +0xeb, +0xed, +0x6b, +0xb0, +0x9c, +0xd3, +0x94, +0x21, +0x33, +0x70, +0x16, +0x53, +0xfe, +0x21, +0x08, +0xf5, +0xbc, +0x1b, +0xb1, +0x92, +0x3a, +0xf4, +0xd2, +0xc7, +0x40, +0x96, +0x01, +0xbc, +0xd5, +0x96, +0x35, +0x17, +0xa0, +0x0c, +0x1b, +0x85, +0x2e, +0xb3, +0xab, +0x55, +0xcb, +0xc0, +0xe0, +0x75, +0x61, +0xf8, +0xa6, +0x96, +0xa2, +0x3f, +0x57, +0xd6, +0xb7, +0xfc, +0x8a, +0x6a, +0x6c, +0xcc, +0x38, +0x7f, +0x8c, +0x99, +0x6e, +0x5e, +0x19, +0x23, +0x21, +0xb0, +0xaa, +0x77, +0x43, +0x05, +0x9b, +0x70, +0xd4, +0xde, +0xf8, +0x7d, +0xbb, +0xad, +0x28, +0x4e, +0xc3, +0x5f, +0x97, +0x1d, +0x79, +0xd9, +0x55, +0xce, +0x28, +0x13, +0x91, +0xc0, +0x94, +0x5b, +0x04, +0x70, +0xcc, +0xd8, +0x38, +0xa9, +0x72, +0x0e, +0xd1, +0x93, +0xe4, +0x81, +0x47, 
+0x29, +0xec, +0x0f, +0x4c, +0x5d, +0x50, +0x6e, +0xd7, +0xa2, +0xe3, +0x4e, +0x58, +0x29, +0x05, +0xa2, +0x37, +0x27, +0xfe, +0x84, +0x51, +0xac, +0xc2, +0xc8, +0x03, +0xe6, +0x5f, +0xea, +0x15, +0x25, +0x7a, +0x39, +0x60, +0xd1, +0xc8, +0x94, +0xb3, +0xf9, +0x8b, +0x20, +0x45, +0xec, +0x75, +0x54, +0xd1, +0xdd, +0x6a, +0x39, +0x31, +0xc9, +0x62, +0x67, +0x84, +0xb3, +0x47, +0xbb, +0x10, +0xb1, +0x0f, +0x5e, +0x11, +0x00, +0x1e, +0xc5, +0x79, +0xf9, +0xd9, +0xb7, +0x18, +0x0a, +0x55, +0x1d, +0x99, +0xd5, +0x46, +0x2a, +0x22, +0xd9, +0x2e, +0xc3, +0x2d, +0xd1, +0x9e, +0xb9, +0x3a, +0x32, +0x1a, +0x13, +0xdb, +0xe5, +0x45, +0x1a, +0x83, +0x38, +0xec, +0x45, +0x7b, +0x87, +0x01, +0x3c, +0x32, +0xd3, +0x60, +0x09, +0xdd, +0xce, +0x87, +0xa3, +0x03, +0x02, +0x91, +0xfa, +0xc5, +0x4d, +0x4d, +0x08, +0x6d, +0xca, +0x9e, +0xfa, +0x7e, +0x8c, +0xdf, +0x3e, +0xb3, +0x10, +0xfb, +0xa3, +0x2f, +0xef, +0x7e, +0x1e, +0x9f, +0xd3, +0x78, +0xfb, +0x5f, +0x50, +0x4c, +0xaa, +0x16, +0x0c, +0x35, +0x46, +0x10, +0x3c, +0x67, +0xaa, +0x95, +0x8a, +0x38, +0xf3, +0xb5, +0x07, +0xdb, +0x6c, +0x52, +0xde, +0xc6, +0x77, +0x0e, +0xd0, +0xf7, +0xea, +0x1f, +0x96, +0xae +}; + +uint8_t pIn_0 [] = { +0x09, +0xe8, +0x1c, +0x6f, +0x55, +0xe1, +0xa3, +0x4d, +0xad, +0x94, +0x4a, +0x9d, +0x90, +0xfc, +0x76, +0xc9, +0x84, +0xce, +0xa8, +0x18, +0xe2, +0x30, +0x85, +0x41, +0x6f, +0x11, +0x09, +0xff, +0x07, +0xb0, +0x5b, +0xe4, +0x40, +0xf3, +0x8f, +0xcd, +0xa3, +0x2f, +0x85, +0x67, +0x31, +0x96, +0xc2, +0x86, +0x1d, +0xe5, +0xc1, +0xee, +0x0b, +0x4d, +0x50, +0xa6, +0x00, +0x69, +0xff, +0x8c, +0x4c, +0xb5, +0x46, +0xd5, +0x14, +0xd0, +0xdb, +0x47, +0xf9, +0x2d, +0x77, +0x9a, +0x92, +0xa4, +0xe8, +0xab, +0xbd, +0xd3, +0x88, +0xb6, +0x00, +0xbc, +0xb0, +0xe7, +0x60, +0xd9, +0x8b, +0x1b, +0x9e, +0x06, +0x95, +0x99, +0x8d, +0x6e, +0x21, +0x20, +0xa3, +0x5e, +0x99, +0x27, +0xb1, +0x7d, +0xce, +0xdc, +0xcf, +0xd6, +0xe8, +0x16, +0xaa, +0xec, +0x67, +0x8c, +0x23, +0xd0, +0x15, +0xa0, +0xe8, +0x86, +0x2b, 
+0x90, +0xe1, +0xb2, +0x5b, +0x13, +0x19, +0xbe, +0x8c, +0x9e, +0x12, +0x13, +0xeb, +0x69 +}; + +int32_t pLambda_0 [] = { +0x00247d20, +0x001b6100, +0x00124840, +0x00247d20, +0x00247d20, +0x00124840, +0x001b6100, +0x00247d20, +0x00092c20, +0x00247d20, +0x001b6100, +0x00124840, +0x00124840, +0x00124840, +0x00124840, +0x001b6100, +0x00124840, +0x00124840, +0x00092c20, +0x00124840, +0x001b6100, +0x00092c20, +0x00124840, +0x001b6100, +0x001b6100, +0x00247d20, +0x00247d20, +0x00092c20, +0x001b6100, +0x00092c20, +0x00247d20, +0x001b6100 +}; + +int32_t pKappa_0 [] = { +0x00000acb, +0x00000818, +0x00000566, +0x00000acb, +0x00000acb, +0x00000566, +0x00000818, +0x00000acb, +0x000002b3, +0x00000acb, +0x00000818, +0x00000566, +0x00000566, +0x00000566, +0x00000566, +0x00000818, +0x00000566, +0x00000566, +0x000002b3, +0x00000566, +0x00000818, +0x000002b3, +0x00000566, +0x00000818, +0x00000818, +0x00000acb, +0x00000acb, +0x000002b3, +0x00000818, +0x000002b3, +0x00000acb, +0x00000818 +}; + + +/* -----------EXPECTED OUTPUTS----------- */ +uint8_t exp_outp_0 [] = { +0xbb, +0x94, +0x64, +0xc4, +0x8c, +0x6e, +0x90, +0xff, +0x34, +0xbe, +0x8a, +0x76, +0x5f, +0x58, +0x58, +0x88, +0x6a, +0x6b, +0x34, +0x5f, +0x77, +0x30, +0x66, +0xa7, +0xa9, +0xbe, +0xe5, +0x33, +0xc3, +0x2d, +0xbc, +0x80, +0x91, +0x97, +0x53, +0xb6, +0x25, +0x55, +0xa3, +0xc9, +0x29, +0x7f, +0x79, +0x66, +0x58, +0x3e, +0x3c, +0x79, +0x46, +0x68, +0x2d, +0x51, +0x5d, +0x2a, +0x65, +0x92, +0x5b, +0x6c, +0xbd, +0x33, +0x97, +0x26, +0x79, +0x7e, +0xb6, +0x98, +0x45, +0xa0, +0x37, +0x54, +0x9a, +0xd5, +0x20, +0x87, +0x7d, +0x69, +0x5f, +0x43, +0x4b, +0x7e, +0x49, +0x77, +0x26, +0x5f, +0x54, +0x2b, +0x77, +0x91, +0x71, +0x94, +0xec, +0x27, +0x85, +0x27, +0x78, +0x6a, +0xe0, +0xb9, +0x76, +0xb4, +0x9e, +0x6b, +0xc0, +0xf0, +0x34, +0xd6, +0x7d, +0x7a, +0x5a, +0x55, +0x69, +0x86, +0x52, +0x7e, +0x34, +0x6b, +0x80, +0x39, +0x80, +0xc3, +0x90, +0xc0, +0xf0, +0x3d, +0xb4, +0x38, +0xc6, +0xb3, +0xa9, +0x7c, +0x4e, +0x5d, +0x68, 
+0x74, +0xa2, +0xc5, +0x26, +0xac, +0x7d, +0x5e, +0x44, +0x48, +0x55, +0x77, +0x52, +0x6e, +0x30, +0x60, +0x5d, +0x1f, +0x57, +0x84, +0x8f, +0x99, +0xa4, +0x29, +0x97, +0x2e, +0x80, +0x6f, +0x69, +0x63, +0x42, +0x35, +0x00, +0x61, +0x68, +0x9c, +0x21, +0x82, +0x36, +0x38, +0x10, +0x24, +0x48, +0x39, +0x16, +0x55, +0x14, +0x2c, +0x28, +0x22, +0x43, +0x58, +0x41, +0x4b, +0x8c, +0x21, +0x53, +0x22, +0x58, +0x49, +0x72, +0x62, +0x27, +0x13, +0x0b, +0x52, +0x89, +0x7b, +0x19, +0x93, +0x6b, +0x4c, +0x1b, +0x22, +0x41, +0x36, +0x1a, +0x61, +0x1e, +0x29, +0x4b, +0x12, +0x54, +0x5d, +0x49, +0x68, +0x93, +0x1d, +0x72, +0x26, +0x5a, +0x5b, +0xe8, +0x9d, +0x64, +0x8b, +0x71, +0x65, +0xb3, +0xb4, +0x2e, +0xd2, +0x8e, +0x69, +0x5b, +0x48, +0x6c, +0x6e, +0x33, +0x70, +0x2c, +0x57, +0x67, +0x39, +0x75, +0xb2, +0x80, +0xba, +0xdf, +0x38, +0x9c, +0x33, +0xc1, +0x94, +0x92, +0x67, +0x49, +0x90, +0x5e, +0x75, +0x78, +0xd7, +0x2b, +0xaa, +0x65, +0x72, +0x43, +0x41, +0x56, +0x5e, +0x4c, +0x55, +0x2d, +0x5a, +0x6a, +0x25, +0x54, +0x97, +0x89, +0xcb, +0xab, +0x24, +0x9f, +0x27, +0x84, +0x6b, +0x87, +0x73, +0x3b, +0x25, +0x00, +0x62, +0x63, +0x82, +0x16, +0x7b, +0x38, +0x47, +0x22, +0x1c, +0x49, +0x57, +0x24, +0x58, +0x26, +0x33, +0x25, +0x1c, +0x54, +0x81, +0x31, +0x5f, +0x7f, +0x28, +0x6f, +0x23, +0x52, +0x5b, +0x8c, +0x65, +0x39, +0x34, +0x05, +0x5d, +0x7f, +0x7f, +0x1c, +0x7c, +0x4d, +0x73, +0x11, +0x21, +0x3e, +0x5d, +0x26, +0x55, +0x1a, +0x58, +0x1d, +0x22, +0x43, +0x65, +0x42, +0x56, +0xac, +0x26, +0x5c, +0x2c, +0x4c, +0x5d, +0xf7, +0x87, +0x58, +0x65, +0x59, +0x62, +0x95, +0xa8, +0x27, +0xad, +0x80, +0x5b, +0x51, +0x54, +0x70, +0x6a, +0x34, +0x77, +0x27, +0x75, +0x75, +0x30, +0x6c, +0x95, +0x88, +0xa9, +0xca, +0x2c, +0x7c, +0x37, +0x8f, +0x84, +0xe0, +0xa4, +0x62, +0x7d, +0xaa, +0x77, +0xa7, +0xfd, +0x2f, +0xc5, +0x8a, +0x79, +0x58, +0x68, +0x6b, +0x80, +0x55, +0x7c, +0x43, +0x7b, +0x84, +0x2f, +0x69, +0xa7, +0xa8, +0xf3, +0xbb, +0x35, +0xbd, +0x30, +0xd8, +0x82, +0x9a, +0xa9, 
+0x5b, +0x60, +0x7f, +0x87, +0x82, +0xc9, +0x2c, +0xcf, +0x7f, +0x71, +0x51, +0x63, +0x61, +0x7d, +0x35, +0x6a, +0x31, +0x5f, +0x63, +0x38, +0x50, +0xaf, +0x6d, +0xd0, +0x8d, +0x31, +0x88, +0x31, +0xc9, +0x8b, +0xaa, +0x8e, +0x6a, +0x6c, +0x82, +0x78, +0x83, +0xc4, +0x31, +0xdc, +0x77, +0x77, +0x4c, +0x57, +0x6c, +0x8b, +0x47, +0x6b, +0x3b, +0x69, +0x7d, +0x36, +0x52, +0xab, +0x7b, +0xda, +0xbc, +0x33, +0xa2, +0x2e, +0xbe, +0x75, +0xf6, +0xab, +0x6a, +0xb6, +0xad, +0x7b, +0xa4, +0xc6, +0x33, +0xe6, +0x8e, +0x7b, +0x77, +0x62, +0x79, +0x84, +0x4a, +0x79, +0x34, +0x7d, +0x88, +0x36, +0x6a, +0xbf, +0x88, +0xf9, +0xdf, +0x3d, +0x98, +0x3a, +0xfd, +0xaa +}; + + + +L1_DATA uint32_t threshs_l1 [32] = {0}; +L1_DATA int32_t kappa_l1 [32] = {0}; +L1_DATA int32_t lambda_l1 [32] = {0}; +L1_DATA uint8_t inp_l1 [128] = {0}; +L1_DATA uint8_t outp_l1 [512] = {0}; +L1_DATA int8_t wt_l1 [1152] = {0}; +L1_DATA uint8_t im2col_l1 [IM2COL_DIM] = {0}; + +#endif diff --git a/rt_nn_tests/xpnnv2_conv/pulp_nn_kernels.h b/rt_nn_tests/xpnnv2_conv/pulp_nn_kernels.h new file mode 100644 index 0000000..f4ec977 --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/pulp_nn_kernels.h @@ -0,0 +1,26528 @@ +/* + * pulp_nn_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __XPULPNN_KERNELS__ +#define __XPULPNN_KERNELS__ + +void xpulp_nn_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, 
+ uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + 
uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2_4x4( 
+ uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2_4x4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, 
+ uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + 
uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2_4x4( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, 
+ int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, 
+ uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif + + +void xpulp_nn_conv_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, 
+ uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t 
ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, 
+ int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, 
+ uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4( + int8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4( + uint8_t *pIn, 
+ int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +uint8_t *xpulp_nn_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + 
uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i8( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i8( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i4( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i2( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i2( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i8( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i4( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i2( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i8( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i4( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i4( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i2_u4_i2( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i8( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: int8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i4( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, int8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i2( /* Naming: xpulp_nn_depthwise_IN_OUT_W. The u/i of IN matches uint8_t/int8_t for pIn and pIm2ColBuffer, the u/i of OUT matches pOut (here: uint8_t pIn, uint8_t pOut); pBias, pWeight and pWtBuffer are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i32_i8( /* xpulp_nn_linear_IN_i32_W prototypes take only pIn, pBias, pOut, pWeight, dim_vec and num_o_neurons (no pKappa, pLambda, out_mul, out_shift or flag parameters). The u/i of IN matches uint8_t/int8_t for pIn. NOTE(review): pOut is declared as an int8_t pointer although the name tag says i32 -- presumably the kernel reinterprets it internally; confirm against the kernel sources. */ + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void 
xpulp_nn_linear_u4_i32_i2( /* xpulp_nn_linear_IN_i32_W prototypes take only pIn, pBias, pOut, pWeight, dim_vec and num_o_neurons. The u/i of IN matches uint8_t/int8_t for pIn. NOTE(review): pOut is declared as an int8_t pointer although the name tag says i32 -- presumably the kernel reinterprets it internally; confirm against the kernel sources. */ + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_u8_i8( /* xpulp_nn_linear_IN_OUT_W prototypes with a u/i OUT tag additionally take pKappa, pLambda, out_mul, out_shift, flag_relu and flag_batchnorm; the u/i of IN and OUT matches uint8_t/int8_t for pIn and pOut. What the extra parameters do is not visible here (presumably batch-norm/requantization of the output) -- TODO confirm. */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i8( /* Naming: xpulp_nn_linear_IN_OUT_W. The u/i of IN and OUT matches uint8_t/int8_t for pIn and pOut (here: int8_t pIn, int8_t pOut); pBias and pWeight are int8_t pointers in every variant visible here. The digits presumably give element bit-widths and W the weight format -- TODO confirm against the kernel sources. */ + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i2( + 
int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_maxpool_u8( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i8( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u4( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i4( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u2( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + 
uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t 
padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + 
uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t 
out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t 
out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif diff --git a/rt_nn_tests/xpnnv2_conv/pulp_nn_mix_kernels.h b/rt_nn_tests/xpnnv2_conv/pulp_nn_mix_kernels.h new file mode 100644 index 0000000..8b2a3c6 --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/pulp_nn_mix_kernels.h @@ -0,0 +1,7093 @@ +/* + * pulp_nn_mix_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_MIXED_KERNELS__ +#define __XPULPNN_MIXED_KERNELS__ + +void xpulp_nn_mix_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4_4x4( + uint8_t *pIn,
+ uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, 
+ uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, 
+ uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8( + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, 
+ uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_mix_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t 
lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * 
pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + 
uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + 
uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xpnnv2_conv/pulp_nn_utils.h b/rt_nn_tests/xpnnv2_conv/pulp_nn_utils.h new file mode 100644 index 0000000..44d2251 --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/pulp_nn_utils.h @@ -0,0 +1,2079 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include + +typedef signed short v2s __attribute__((vector_size (4))); + + + +#define min(a,b) ((a)<(b)?(a):(b)) +#define log2(x) __builtin_pulp_fl1(x) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define CHANS_DECOMPR(x) ((5*(x)) >> 2) // equivalent to division by 0.8; the argument is parenthesised so expression arguments expand correctly + +/* Functions for Compressed MAC */ +#define CompressedMAC(sum, ptr, config) asm volatile( \ + "pv.smlsdotsp.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define CompressedMACUnsigned(sum, ptr, config) asm volatile( \ + "pv.smlsdotsup.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define InitNNRF(ptr, config) asm volatile( \ + "pv.smlsdotsp.t x0, %[phtr], %[chonfig];" \ + : [phtr] "+r" (ptr) : [chonfig] "I" (config)) + +#define ThresholdCompress(res, val, thrs) asm volatile( \ + "pv.thrc %[rhes], %[vhal], %[thhrs];" : [rhes] "+r" (res) : [vhal] "r" (val), [thhrs] "r" (thrs)) + +#define GetConfig(a_update, b_update, a_reg, b_reg) (((a_update) << 4) | ((b_update) << 3) | ((a_reg) << 1) | (b_reg)) /* packs the fields into one value: a_update at bit 4, b_update at bit 3, a_reg from bit 1, b_reg at bit 0; fully parenthesised so it is safe inside larger expressions */ + +/* Functions for threshold&compress */ +#define check_store(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; \ + incr_val=ch_out_r; } + +#define check_store_4x1(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; } + +#define reset_currThr() \ + if ((uint32_t *) currThr == (uint32_t *) (pThr + ch_out)) { 
\ + currThr = (v2s *) pThr; \ + } + +#define MacLoads20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define MacLoad20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +/* Functions for compressed min/max */ +#define CompressedMax(res, in1, in2) asm volatile( \ + "pv.max.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define CompressedMin(res, in1, in2) asm volatile( \ + "pv.min.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Max16(res, in1, in2) asm volatile( \ + "pv.max.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Min16(res, in1, in2) asm volatile( \ + "pv.min.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define thr_cmp(state, val, threshs) __builtin_pulp_thresh_compr(state, val, threshs) + +typedef unsigned char v4u __attribute__((vector_size (4))); +typedef signed char v4s __attribute__((vector_size (4))); + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#ifdef __clang__ +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_binsert(dst,not_mask_imm,src,mask_imm,off) +#else +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#endif +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define maxs20(a, b) __builtin_pulp_max20(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) 
__builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define mins20(a, b) __builtin_pulp_min20(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define 
MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, 
+ int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + 
int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= 
pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) 
pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) 
bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t 
*__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) 
bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 
20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) 
bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = 
*((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + v4u out_vec; uint8_t *out = (uint8_t *) &out_vec; /* scratch storage for the 4-lane max result; previously an uninitialised pointer was written through here */ + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) 
bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + v4u out_vec; uint8_t *out = (uint8_t *) &out_vec; /* scratch storage for the 4-lane max result; previously an uninitialised pointer was written through here */ + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], 
com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void 
__attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + v4s out_vec; int8_t *out = (int8_t *) &out_vec; /* scratch storage for the 4-lane max result; previously an uninitialised pointer was written through here */ + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) 
bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = 
(v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_tnn_zero_mem_ternary(uint8_t * pBuffer, unsigned int size, unsigned int uns) +{ + uint8_t pad_val = 0xd9; + uint32_t pad_vec = 0xd9d9d9d9; + if (uns) { + // if we are using an unsigned kernel, we need to pad with -1 because the hardware will add a +1 to ALL values! 
+ pad_val = 0xff; + pad_vec = 0xffffffff; + } + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u)pad_vec; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=pad_val; + lfover-=4; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + 
uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} 
+static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_tnn_compare_and_replace_if_larger_ternary(int8_t * base, + int8_t * target, + uint16_t length) +{ + uint8_t mask2 = 0x0c; + uint8_t n_mask2 = ~ mask2; + uint8_t mask4 = 0x30; + uint8_t n_mask4 = ~ mask4; + uint8_t mask6 = 0xc0; + uint8_t n_mask6 = ~ mask6; + uint8_t off2 = 2; + uint8_t off4 = 4; + uint8_t off6 = 6; + + uint8_t *pIn = (uint8_t *) base; + uint8_t *pCom = (uint8_t *) target; + uint8_t *out; + + int cnt = length >> 2; + uint32_t result; + + while(cnt > 0u) + { + uint32_t in1 
= *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + *((uint32_t *)pIn) = result; + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + if (left>0u) + { + // do the vector max on the whole word - we won't use the leftover bytes + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + + // ...and copy back the relevant bytes of the result to pIn + for (int i=0; i> (8*i)); + + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, 
off6); + + pIn++; + pCom++; + left--; + } +} + +#endif diff --git a/rt_nn_tests/xpnnv2_conv/pulp_nn_utils_xpnn.h b/rt_nn_tests/xpnnv2_conv/pulp_nn_utils_xpnn.h new file mode 100644 index 0000000..0c783ae --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/pulp_nn_utils_xpnn.h @@ -0,0 +1,1937 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" +#ifdef GAP_SDK +#include "pulp.h" +#endif + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) 
+#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + 
int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = 
(integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + 
int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = 
(uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) 
out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + 
+ pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) 
bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 
2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); 
+ bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 
4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length 
>> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + 
*((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) 
inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) 
bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) 
bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + 
while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, 
(int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = 
bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +#endif diff --git a/rt_nn_tests/xpnnv2_conv/test.c b/rt_nn_tests/xpnnv2_conv/test.c new file mode 100644 index 0000000..a332ad8 --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/test.c @@ -0,0 +1,221 @@ +#include +#include +#include + + +#include "pmsis.h" + +#include "data_statstest.h" +//#include "pulp_nn_kernels.h" +#include "pulp_nn_mix_kernels.h" + + +#define start_cycle_counter() asm volatile("csrw 0xCC0, 0x01;") +#define stop_cycle_counter() asm volatile("csrw 0xCC0, 0x00;") +#define read_cycle_counter(x) asm volatile("csrr %0, 0x780;" : "=r" (x)) +#define reset_cycle_counter() asm volatile("csrw 0x780, 0x0;") + +uint8_t im2col[IM2COL_DIM] = {0}; +uint8_t outputs[OUTPUT_DIM] = {0}; + +int32_t outputs_fp[OUTPUT_DIM_FP] = {0}; + +#ifndef PROFILE +int num_cycles; +#else +int im2col_cycles; +int hotloop_prep_cycles; +int hotloop_cycles; +int threshold_cycles; +int requant_cycles; +int hotloop_leftover_cycles; +int matmul4x2_leftover_cycles; +#endif + +void call_krnl_0(); +void test_0(); + +int main(int argc, char *argv[]) +{ +#if KRAKEN_PTEST == 1 + kraken_padframe_aon_pad_gpioa_cfg_rxe_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_trie_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); +#endif + int n_mismatches = 0; + int mismatches_tot = 0; + + + + #ifndef PROFILE + num_cycles = 0; + #endif + + if (get_core_id() == 0) { + printf("===> TEST 0: Running xpulp_nn_mix_conv_u4_u8_i2...\n"); + printf(" dims_in = [4, 4]\n"); + printf(" dims_kernel = [3, 3]\n"); + printf(" ch_in/out = [16, 32]\n"); + //printf(" padding_y_top = [%d]\n", padding_y_top); + //printf(" padding_y_bottom = [%d]\n", padding_y_bottom); + //printf(" padding_x_left = [%d]\n", padding_x_left); + //printf(" padding_x_right = [%d]\n", padding_x_right); + //printf(" stride_x = [%d]\n", stride_x); + //printf(" stride_y = [%d]\n", 
stride_y); + } + test_0(); + #ifndef PROFILE + stop_cycle_counter(); + read_cycle_counter(num_cycles); + if (get_core_id() == 0) { + printf("===> TEST 0: Finished running xpulp_nn_mix_conv_u4_u8_i2\n"); + printf("num_cycles = %d\n", num_cycles); + printf("MACs = 73728\n"); + printf("MACs/cycle = %.4f\n", 73728/num_cycles); + } + #endif + if (get_core_id() == 0) { + printf("Checking for mismatches..\n"); + n_mismatches = 0; + + for(int i=0; i < 512; i++) { + if (outputs[i] != exp_outp_0[i]){ + printf("***Mismatch in test 0 at iteration %d: Expected: %x, got: %x\n", i, exp_outp_0[i], outputs[i]); + n_mismatches++; + } + } + } + mismatches_tot += n_mismatches; + + if (get_core_id() == 0) { + printf("Got %d mismatches in %d tests\n", mismatches_tot, 1); + } + return mismatches_tot; +} + + +void call_krnl_0(void) { + uint8_t * pInp; + uint8_t * pIm2ColBuffer; + int8_t * pBias = NULL; + uint8_t * pOut; + int8_t * pWeight; + uint32_t * pThr; + int32_t * pKappa, pLambda; + #ifdef PROFILE + int32_t im2col_cycles = 0; + int32_t hotloop_prep_cycles = 0; + int32_t hotloop_cycles = 0; + int32_t threshold_cycles = 0; + int32_t requant_cycles = 0; + int32_t hotloop_leftover_cycles = 0; + int32_t matmul4x2_leftover_cycles = 0; + #endif + pInp = inp_l1; + + pOut = outp_l1; + pIm2ColBuffer = im2col_l1; + pWeight = wt_l1; + pThr = threshs_l1; + pKappa = kappa_l1; + pLambda = lambda_l1; +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1); + } +#endif + xpulp_nn_mix_conv_u4_u8_i2( + pInp, + pIm2ColBuffer, + pBias, + pOut, + pWeight, + pKappa, + pLambda, + 1, + 13, + 4, + 4, + 16, + 4, + 4, + 32, + 3, + 3, + 1, + 1, + 1, + 1, + 1, + 1, + 1, +#ifndef PROFILE + 1 +#else + 1, + &im2col_cycles, + &hotloop_prep_cycles, + &hotloop_cycles, + &requant_cycles, + &hotloop_leftover_cycles, + &matmul4x2_leftover_cycles +#endif + ); + +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); + } 
+#endif + #ifdef PROFILE + if (pi_core_id() == 0) { + printf("im2col_cycles = %d\n", im2col_cycles); + printf("hotloop_prep_cycles = %d\n", hotloop_prep_cycles); + printf("hotloop_cycles = %d\n", hotloop_cycles); + printf("requant_cycles = %d\n", requant_cycles); + printf("threshold_cycles = %d\n", threshold_cycles); + printf("hotloop_leftover_cycles = %d\n", hotloop_leftover_cycles); + printf("matmul4x2_leftover_cycles = %d\n", matmul4x2_leftover_cycles); + } + #endif +} + +void test_0(void) { + // DMA transfer inputs from L2 to L1 + if (pi_core_id() == 0) { + plp_dma_memcpy(pIn_0, inp_l1, 128, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + if (pi_core_id() == 0) { + plp_dma_memcpy(pLambda_0, lambda_l1, 32 * 4, 1); // 4 bytes per lambda item + plp_dma_barrier(); + } + if (pi_core_id() == 0) { + plp_dma_memcpy(pKappa_0, kappa_l1, 32 * 4, 1); // 4 bytes per lambda item + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + // transfer weights + if (pi_core_id() == 0) { + plp_dma_memcpy(pWeight_0, wt_l1, 1152, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + call_krnl_0(); + // get outputs back with DMA + if (pi_core_id() == 0) { + plp_dma_memcpy(outputs, outp_l1, 512, 0); + plp_dma_barrier(); + } +} + + + + + + + + + + diff --git a/rt_nn_tests/xpnnv2_conv/xpulp_nn_mix_conv_u4_u8_i2.c b/rt_nn_tests/xpnnv2_conv/xpulp_nn_mix_conv_u4_u8_i2.c new file mode 100644 index 0000000..6055ed9 --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/xpulp_nn_mix_conv_u4_u8_i2.c @@ -0,0 +1,298 @@ +/* + * xpulp_nn_mix_conv_u4_u8_i2.c + * Nazareno Bruschi + * Alessandro Nadalini + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" +#include "pulp_nn_utils.h" +#include "pulp_nn_kernels.h" + + +void __attribute__((noinline)) xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mult, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batch_norm) +{ + uint16_t ch_in_r = PACK_INT4_SIZE(ch_in); + uint16_t ch_out_r = PACK_INT8_SIZE(ch_out); + + int core_id = pi_core_id(); + uint8_t * pIm2ColBase = pIm2ColBuffer + (2 * core_id * PACK_INT4_SIZE(ch_in) * dim_kernel_x * dim_kernel_y); + int i_out_y, i_out_x, i_ker_y, i_ker_x; + int Log2Core; + + uint8_t extra_chunk = ((dim_out_y & (NUM_CORES-1)) != 0); + uint8_t extra_chunk_r; + uint16_t dim_out_x_r; + uint8_t section; + int core_id_r; + + if(extra_chunk && dim_out_x > 1) + { + Log2Core = log2(NUM_CORES >> 1); + core_id_r = (core_id >> 1); + dim_out_x_r = (dim_out_x >> 1); + section = (core_id & 0x1); + extra_chunk_r = ((dim_out_y & ((NUM_CORES >> 1) - 1)) != 0); + } + else + { + Log2Core = log2(NUM_CORES); + core_id_r = core_id; + dim_out_x_r = dim_out_x; + section = 0; + extra_chunk_r = extra_chunk; + extra_chunk = 0; + } + + uint8_t flag_dim_out_x_odd = 
dim_out_x & 0x01; + + int chunk = (dim_out_y >> Log2Core) + extra_chunk_r; + + int start_pixel = min((chunk * core_id_r), dim_out_y); + int stop_pixel = min(start_pixel + chunk, dim_out_y); + + uint8_t *pIm2Col = pIm2ColBase; + uint8_t *pOutBuffer = pOut + (start_pixel * ch_out_r * dim_out_x) + (section * ch_out_r * dim_out_x_r); + + for (i_out_y = start_pixel; i_out_y < stop_pixel; i_out_y++) + { + for(i_out_x=(section * dim_out_x_r); i_out_x<(dim_out_x_r + (section * (dim_out_x_r + flag_dim_out_x_odd))); i_out_x++) + { + if(i_out_y < padding_y_top) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x=((i_out_x * stride_x) - padding_x_left); i_ker_x<((i_out_x * stride_x) - padding_x_left + dim_kernel_x); i_ker_x++) + { + if((i_ker_y < 0) || (i_ker_y >= dim_in_y) || (i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t*) (pIn + ((i_ker_y * dim_in_x + i_ker_x) * ch_in_r)), pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + else if(i_out_y < dim_out_y - padding_y_bottom) + { + if(i_out_x < padding_x_left) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x=((i_out_x * stride_x) - padding_x_left); i_ker_x<((i_out_x * stride_x) - padding_x_left + dim_kernel_x); i_ker_x++) + { + if((i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t*) (pIn + ((i_ker_y * dim_in_x + i_ker_x) * ch_in_r)), pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + else if(i_out_x < (dim_out_x - padding_x_right)) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + xpulp_nn_im2col_u4_to_u4((uint8_t*) pIn + (i_ker_y * dim_in_x + 
i_out_x * stride_x - padding_x_left)*ch_in_r,pIm2Col,ch_in * dim_kernel_x); + pIm2Col+=PACK_INT4_SIZE(ch_in * dim_kernel_x); + } + } + else + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x = i_out_x * stride_x - padding_x_left; i_ker_x < i_out_x * stride_x - padding_x_left + dim_kernel_x; i_ker_x++) + { + if((i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t *)pIn + (i_ker_y*dim_in_x+i_ker_x)* ch_in_r, pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + } + else + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x = i_out_x * stride_x - padding_x_left; i_ker_x < i_out_x * stride_x - padding_x_left + dim_kernel_x; i_ker_x++) + { + if(i_ker_y < 0 || (i_ker_y >= dim_in_y) || i_ker_x < 0 || i_ker_x >= dim_in_x) + { + xpulp_nn_zero_mem_u4(pIm2Col, ch_in); + } + else + { + xpulp_nn_im2col_u4_to_u4((uint8_t *) pIn + (i_ker_y * dim_in_x + i_ker_x) * ch_in_r, pIm2Col, ch_in); + } + pIm2Col+=PACK_INT4_SIZE(ch_in); + } + } + } + if(pIm2Col == (pIm2ColBase + ((PACK_INT4_SIZE(ch_in) * dim_kernel_x * dim_kernel_y) << 1))) + { + pOutBuffer = xpulp_nn_mix_matmul_u4_u8_i2( + pIm2ColBase, + pBias, + pOutBuffer, + pOutBuffer + ch_out_r, + pWeight, + pKappa, + pLambda, + out_mult, + out_shift, + (ch_in * dim_kernel_x * dim_kernel_y), + ch_out, + flag_relu, + flag_batch_norm + ); + + pIm2Col = pIm2ColBase; + } + } + + if(pIm2Col != pIm2ColBase) + { + + MIXED_SKIP("1"); + + const int8_t *pA = pWeight; + int i; + int32_t * k1 = pKappa; + int32_t * lambda1 = pLambda; + v4s inA[2]; + uint16_t num_col_im2col = ch_in * dim_kernel_x * dim_kernel_y; + uint16_t num_col_im2col_w = PACK_INT2_SIZE(ch_in) * dim_kernel_x * dim_kernel_y; + + for(i = 0; i < ch_out; i++) + { + int sum = 0; + if (pBias != 
NULL) + { + sum = *((int*) pBias); + pBias += 4; + } + + uint8_t *pB = pIm2ColBase; + + int32_t *ptrA = (int32_t *)pA; + uint32_t *ptrB = (uint32_t *)pB; + + for(int j=0; j < (num_col_im2col >> 4); j++) + { + sum = SumDotp8(*(uint32_t *)ptrB, *(int32_t *)ptrA, sum); + + ptrB++; + + sum = SumDotp8(*(uint32_t *)ptrB, *(int32_t *)ptrA, sum); + + ptrA++; + ptrB++; + } + + int col_cnt_im2col = num_col_im2col & 0xf; + + if(col_cnt_im2col) + { + + uint16_t loop_cnt_im2col_a = (num_col_im2col >> 4) << 3; + pB+=loop_cnt_im2col_a; + + do + { + int8_t inA1 = (int8_t) bitext((int) *pA, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pB, 4, 0); + sum += inA1 * inB1; + inA1 = (int8_t) bitext((int) *pA, 2, 2); + inB1 = (uint8_t) bitextu((unsigned int) *pB, 4, 4); + sum += inA1 * inB1; + pB++; + inA1 = (int8_t) bitext((int) *pA, 2, 4); + inB1 = (uint8_t) bitextu((unsigned int) *pB, 4, 0); + sum += inA1 * inB1; + inA1 = (int8_t) bitext((int) *pA, 2, 6); + inB1 = (uint8_t) bitextu((unsigned int) *pB, 4, 4); + sum += inA1 * inB1; + + pA++; + pB++; + col_cnt_im2col-=4; + } while(col_cnt_im2col); + } + if (flag_batch_norm && flag_relu) + { + *pOutBuffer = pulp_nn_bn_quant_u8(sum, *k1, *lambda1, out_shift); + k1++; + lambda1++; + pOutBuffer++; + } + else + { + if(flag_relu == 1) + { + *pOutBuffer = pulp_nn_quant_u8(sum, out_mult, out_shift); + pOutBuffer++; + } + else + { + *pOutBuffer = (uint8_t) clip8(sum >> out_shift); + pOutBuffer++; + } + } + } + } + pOutBuffer+=(extra_chunk * ((dim_out_x_r + ((1 - section) * flag_dim_out_x_odd)) * ch_out_r)); + pIm2Col = pIm2ColBase; + } + pi_cl_team_barrier(); +} diff --git a/rt_nn_tests/xpnnv2_conv/xpulp_nn_mix_matmul_u4_u8_i2.c b/rt_nn_tests/xpnnv2_conv/xpulp_nn_mix_matmul_u4_u8_i2.c new file mode 100644 index 0000000..bd31a9c --- /dev/null +++ b/rt_nn_tests/xpnnv2_conv/xpulp_nn_mix_matmul_u4_u8_i2.c @@ -0,0 +1,470 @@ +/* + * xpulp_nn_mix_matmul_u4_u8_i2.c + * Nazareno Bruschi + * Alessandro Nadalini + * + * Copyright (C) 2019-2020 
University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" +#include "pulp_nn_utils.h" + +/* xpulp_nn_mix_matmul_u4_u8_i2: matmul over two activation columns of pIn (4-bit unsigned fields, second column num_col_im2col_a bytes after the first) against 2-bit signed weight rows of pWeight; 8-bit results go to pOut (first column) and pOut2 (second column). Output channels are processed four at a time, then the ch_out & 0x3 remainder one at a time. With flag_batch_norm and flag_relu both set each sum goes through pulp_nn_bn_quant_u8 with per-channel pKappa/pLambda; with only flag_relu == 1 through pulp_nn_quant_u8(out_mult, out_shift); otherwise it is shifted right by out_shift and passed to clip8. pBias may be NULL (sums then start at 0). Returns pOut after the written outputs plus PACK_INT8_SIZE(ch_out). */ +uint8_t * __attribute__((noinline)) xpulp_nn_mix_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mult, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batch_norm) +{ + + uint16_t ch_out_r = PACK_INT8_SIZE(ch_out); /* only used for the final pOut advance before returning */ + + uint16_t num_col_im2col_w = PACK_INT2_SIZE(num_col_im2col); + uint16_t num_col_im2col_a = PACK_INT4_SIZE(num_col_im2col); + + int32_t a_rollback = 4 - num_col_im2col_a; + int32_t w_rollback = 4 - (num_col_im2col_w + (num_col_im2col_w << 1)); + + LEGACY_MODE("0"); + IVEC_FMT("5"); + A_STRIDE(num_col_im2col_a); + W_STRIDE(num_col_im2col_w); /* strides are the packed row lengths computed above */ + A_ROLLBACK(a_rollback); + W_ROLLBACK(w_rollback); + A_SKIP("1"); + W_SKIP("3"); + MIXED_SKIP("8"); + + int8_t *pA = pWeight; + + uint16_t chan_left = ch_out & 0x3; /* channels that do not fill a group of four; handled one at a time after the main loop */ + + for(int i=0; i < (ch_out >> 2); i++) + { + uint8_t *pB = pIn; + + uint32_t *ptrB = (uint32_t *) pB; + + int32_t *ptrA = (int32_t *) pA ; + + A_ADDRESS(ptrB); + W_ADDRESS(ptrA); + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA = MacLoadInit(1, 0, 1, 0, ptrA); + ptrA = MacLoadInit(1, 0, 2, 0, ptrA); + ptrA = MacLoadInit(1, 0, 3, 0, ptrA); + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int sum5 
= 0; + int sum6 = 0; + int sum7 = 0; + int sum8 = 0; + + + if (pBias != NULL) + { + sum = *((int*) pBias); /* bias is read as a 32-bit int through the int8_t pointer, hence the 4-byte steps */ + pBias += 4; + sum2 = *((int*) pBias); + pBias += 4; + sum3 = *((int*) pBias); + pBias += 4; + sum4 = *((int*) pBias); + pBias += 4; + + sum5 = sum; + sum6 = sum2; + sum7 = sum3; + sum8 = sum4; + } + + for(int j=0; j<(num_col_im2col >> 4); j++) /* 16 activation elements per column are consumed per iteration */ + { + ptrB = MacLoadInit(0, 1, 0, 1, ptrB); + + sum = MacLoad8(0, 0, 0, 0, ptrA, sum); + sum2 = MacLoad8(0, 0, 1, 0, ptrA, sum2); + sum3 = MacLoad8(0, 0, 2, 0, ptrA, sum3); + sum4 = MacLoad8(0, 1, 3, 0, ptrB, sum4); + ptrB = MacLoadUpdate(ptrB); + + sum5 = MacLoad8(0, 0, 0, 1, ptrA, sum5); + sum6 = MacLoad8(0, 0, 1, 1, ptrA, sum6); + sum7 = MacLoad8(0, 0, 2, 1, ptrA, sum7); + sum8 = MacLoad8(0, 1, 3, 1, ptrB, sum8); + ptrB = MacLoadUpdate(ptrB); + + + MemoryFence(); + + sum = MacLoad8(0, 0, 0, 0, ptrA, sum); + sum2 = MacLoad8(0, 0, 1, 0, ptrA, sum2); + sum3 = MacLoad8(0, 0, 2, 0, ptrA, sum3); + sum4 = MacLoad8(0, 1, 3, 0, ptrB, sum4); + ptrB = MacLoadUpdate(ptrB); + + sum5 = MacLoad8(1, 0, 0, 1, ptrA, sum5); + ptrA = MacLoadUpdate(ptrA); + + sum6 = MacLoad8(1, 0, 1, 1, ptrA, sum6); + ptrA = MacLoadUpdate(ptrA); + + sum7 = MacLoad8(1, 0, 2, 1, ptrA, sum7); + ptrA = MacLoadUpdate(ptrA); + + sum8 = MacLoad8(1, 0, 3, 1, ptrA, sum8); + ptrA = MacLoadUpdate(ptrA); + } + + asm volatile ("csrr %0, 0x101" : "=r" (pA)); /* pA is reloaded from CSR 0x101, presumably the weight address tracked by the MAC-load unit; confirm against the core manual */ + pA-=4; + + int col_cnt_im2col = num_col_im2col & 0xf; /* elements left over after the 16-wide loop */ + + if(col_cnt_im2col) + { + + uint16_t loop_cnt_im2col_a = (num_col_im2col >> 4) << 3; + + int8_t *pA2 = (pA + num_col_im2col_w); + int8_t *pA3 = (pA2 + num_col_im2col_w); + int8_t *pA4 = (pA3 + num_col_im2col_w); + + pB+=loop_cnt_im2col_a; + + uint8_t *pB2 = (pB + num_col_im2col_a); + + do + { + int8_t inA = (int8_t) bitext((int) *pA, 2, 0); + int8_t inA2 = (int8_t) bitext((int) *pA2, 2, 0); + int8_t inA3 = (int8_t) bitext((int) *pA3, 2, 0); + int8_t inA4 = (int8_t) bitext((int) *pA4, 2, 0); + + uint8_t inB = (uint8_t)bitextu((unsigned int) *pB, 4, 0); + 
uint8_t inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 0); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 2); + inA2 = (int8_t) bitext((int) *pA2, 2, 2); + inA3 = (int8_t) bitext((int) *pA3, 2, 2); + inA4 = (int8_t) bitext((int) *pA4, 2, 2); + + inB = (uint8_t)bitextu((unsigned int) *pB, 4, 4); + inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 4); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + pB++; + pB2++; + + inA = (int8_t) bitext((int) *pA, 2, 4); + inA2 = (int8_t) bitext((int) *pA2, 2, 4); + inA3 = (int8_t) bitext((int) *pA3, 2, 4); + inA4 = (int8_t) bitext((int) *pA4, 2, 4); + + inB = (uint8_t)bitextu((unsigned int) *pB, 4, 0); + inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 0); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 6); + inA2 = (int8_t) bitext((int) *pA2, 2, 6); + inA3 = (int8_t) bitext((int) *pA3, 2, 6); + inA4 = (int8_t) bitext((int) *pA4, 2, 6); + + inB = (uint8_t)bitextu((unsigned int) *pB, 4, 4); + inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 4); + + sum += inA * inB; + sum2 += inA2 * inB; + sum3 += inA3 * inB; + sum4 += inA4 * inB; + + sum5 += inA * inB2; + sum6 += inA2 * inB2; + sum7 += inA3 * inB2; + sum8 += inA4 * inB2; + + pA++; + pA2++; + pA3++; + pA4++; + + pB++; + pB2++; + + col_cnt_im2col-=4; /* four elements per pass; the loop only terminates when the leftover count is a multiple of 4 */ + } while(col_cnt_im2col); + } + if (flag_batch_norm && flag_relu) + { + *pOut = pulp_nn_bn_quant_u8(sum, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum5, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + + *pOut = 
pulp_nn_bn_quant_u8(sum2, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum6, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + + *pOut = pulp_nn_bn_quant_u8(sum3, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum7, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + + *pOut = pulp_nn_bn_quant_u8(sum4, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum8, *pKappa, *pLambda, out_shift); + pOut2++; + pKappa++; + pLambda++; + } + else + { + if (flag_relu == 1) + { + *pOut = pulp_nn_quant_u8(sum, out_mult, out_shift); + pOut++; + *pOut = pulp_nn_quant_u8(sum2, out_mult, out_shift); + pOut++; + *pOut = pulp_nn_quant_u8(sum3, out_mult, out_shift); + pOut++; + *pOut = pulp_nn_quant_u8(sum4, out_mult, out_shift); + pOut++; + + *pOut2 = pulp_nn_quant_u8(sum5, out_mult, out_shift); + pOut2++; + *pOut2 = pulp_nn_quant_u8(sum6, out_mult, out_shift); + pOut2++; + *pOut2 = pulp_nn_quant_u8(sum7, out_mult, out_shift); + pOut2++; + *pOut2 = pulp_nn_quant_u8(sum8, out_mult, out_shift); + pOut2++; + + } + else + { + *pOut = (uint8_t) clip8(sum >> out_shift); + pOut++; + *pOut = (uint8_t) clip8(sum2 >> out_shift); + pOut++; + *pOut = (uint8_t) clip8(sum3 >> out_shift); + pOut++; + *pOut = (uint8_t) clip8(sum4 >> out_shift); + pOut++; + + *pOut2 = (uint8_t) clip8(sum5 >> out_shift); + pOut2++; + *pOut2 = (uint8_t) clip8(sum6 >> out_shift); + pOut2++; + *pOut2 = (uint8_t) clip8(sum7 >> out_shift); + pOut2++; + *pOut2 = (uint8_t) clip8(sum8 >> out_shift); + pOut2++; + + } + } + pA+=(3 * num_col_im2col_w); /* step over three more weight rows, spaced num_col_im2col_w bytes like pA2..pA4 above */ + } + + w_rollback = 4; + W_ROLLBACK(w_rollback); + W_SKIP("0"); + MIXED_SKIP("2"); + + while(chan_left) + { + uint8_t *pB = pIn; + + int8_t *pA = pWeight + (num_col_im2col_w * (ch_out - chan_left)); + + uint32_t *ptrB = (uint32_t *) pB; + + int32_t *ptrA = (int32_t *) pA; + + A_ADDRESS(ptrB); + W_ADDRESS(ptrA); + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + + ptrB = 
MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + if (pBias != NULL) + { + sum = ((int) (*pBias++)); /* NOTE(review): a single int8 bias is read here, whereas the four-channel loop reads 32-bit biases in 4-byte steps; confirm the intended bias layout */ + } + int sum2 = sum; + + for(int j=0; j < (num_col_im2col >> 4); j++) + { + ptrB = MacLoadInit(0, 1, 0, 1, ptrB); + + sum = MacLoad8(0, 1, 0, 0, ptrB, sum); + ptrB = MacLoadUpdate(ptrB); + + sum2 = MacLoad8(0, 1, 0, 1, ptrB, sum2); + ptrB = MacLoadUpdate(ptrB); + + sum = MacLoad8(0, 1, 0, 0, ptrB, sum); + ptrB = MacLoadUpdate(ptrB); + + sum2 = MacLoad8(1, 0, 0, 1, ptrA, sum2); + ptrA = MacLoadUpdate(ptrA); + } + asm volatile ("csrr %0, 0x101" : "=r" (pA)); + pA-=4; + int col_cnt_im2col = num_col_im2col & 0xf; + + if(col_cnt_im2col) + { + + uint16_t loop_cnt_im2col_a = (num_col_im2col >> 4) << 3; + pB+=loop_cnt_im2col_a; + + uint8_t *pB2 = (pB + num_col_im2col_a); /* second activation column starts num_col_im2col_a bytes after the first, matching A_STRIDE and the four-channel loop; the previous loop_cnt_im2col_a offset pointed at the wrong bytes */ + + int8_t *pA2 = (pA + num_col_im2col_w); + int8_t *pA3 = (pA2 + num_col_im2col_w); + int8_t *pA4 = (pA3 + num_col_im2col_w); + + do + { + int8_t inA = (int8_t) bitext((int) *pA, 2, 0); + + uint8_t inB = (uint8_t)bitextu((unsigned int) *pB, 4, 0); + uint8_t inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 0); + + sum += inA * inB; + + sum2 += inA * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 2); + + inB = (uint8_t)bitextu((unsigned int) *pB, 4, 4); + inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 4); + + sum += inA * inB; + + sum2 += inA * inB2; + + pB++; + pB2++; + + inA = (int8_t) bitext((int) *pA, 2, 4); + + inB = (uint8_t)bitextu((unsigned int) *pB, 4, 0); + inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 0); + + sum += inA * inB; + + sum2 += inA * inB2; + + inA = (int8_t) bitext((int) *pA, 2, 6); + + inB = (uint8_t)bitextu((unsigned int) *pB, 4, 4); + inB2 = (uint8_t)bitextu((unsigned int) *pB2, 4, 4); + + sum += inA * inB; + + sum2 += inA * inB2; + + pA++; + + pB++; + pB2++; + + col_cnt_im2col-=4; + } while(col_cnt_im2col); + } + if (flag_batch_norm && flag_relu) + { + *pOut = pulp_nn_bn_quant_u8(sum, *pKappa, *pLambda, out_shift); + pOut++; + *pOut2 = pulp_nn_bn_quant_u8(sum2, *pKappa, *pLambda, 
out_shift); + pOut2++; + pKappa++; + pLambda++; + } + else + { + if (flag_relu == 1) + { + *pOut = pulp_nn_quant_u8(sum, out_mult, out_shift); + pOut++; + *pOut2 = pulp_nn_quant_u8(sum2, out_mult, out_shift); + pOut2++; + } + else + { + *pOut = (uint8_t) clip8(sum >> out_shift); + pOut++; + *pOut2 = (uint8_t) clip8(sum2 >> out_shift); + pOut2++; + } + } + chan_left--; + } + pOut+=ch_out_r; + return pOut; +} diff --git a/rt_nn_tests/xptnn_conv/Makefile b/rt_nn_tests/xptnn_conv/Makefile new file mode 100644 index 0000000..ea4fb4e --- /dev/null +++ b/rt_nn_tests/xptnn_conv/Makefile @@ -0,0 +1,20 @@ +APP = test +PULP_APP = test +PULP_APP_SRCS = test.c +PULP_APP_SRCS += xpulp_tnn_conv_ternary_signed.c +PULP_APP_SRCS += xpulp_tnn_matmul_ternary_signed.c +PULP_APP_SRCS += xpulp_tnn_matmul_ternary_signed_4x1.c + +CORE=8 + + +PULP_CFLAGS += -DNUM_CORES=$(CORE) -I. -O3 +PULP_LDFLAGS += -lc -lm -lgcc -Wl,-print-memory-usage + +PULP_APP_CFLAGS += -DNUM_CORES=$(CORE) -I. -O3 +PULP_APP_LDFLAGS += -lc -lm -lgcc -Wl,-print-memory-usage + +PULP_CFLAGS += -DPULP_RUNTIME -DSINGLE_CORE_DMA -DALWAYS_BLOCK_DMA_TRANSFERS + + +include $(PULP_SDK_HOME)/install/rules/pulp.mk diff --git a/rt_nn_tests/xptnn_conv/data_statstest.h b/rt_nn_tests/xptnn_conv/data_statstest.h new file mode 100644 index 0000000..e9baf5e --- /dev/null +++ b/rt_nn_tests/xptnn_conv/data_statstest.h @@ -0,0 +1,1707 @@ +// this file is generated automatically by the python script `generate_statstest.py` + +#ifndef _DATA_H +#define _DATA_H + + + +#define IM2COL_DIM (36 * NUM_CORES * 2) // for NUM_CORES cores +#define OUTPUT_DIM 160 +#define OUTPUT_DIM_FP (1) +uint8_t pWeight_0 [] = { +0x8d, +0x38, +0x18, +0xbf, +0x48, +0xe8, +0x36, +0xbd, +0xa1, +0x6d, +0x4d, +0xed, +0xaf, +0x30, +0x04, +0x02, +0xec, +0x6d, +0x73, +0x1c, +0x33, +0xc9, +0x45, +0x32, +0x45, +0x9c, +0xff, +0xbd, +0x0e, +0x16, +0x29, +0x1d, +0x6a, +0xb2, +0xf5, +0x28, +0xde, +0xc7, +0x71, +0xdf, +0xc9, +0xbc, +0xfa, +0x6b, +0x9c, +0xa2, +0x24, +0x41, +0x9b, 
+0x17, +0x69, +0x9a, +0xfe, +0xd8, +0xcb, +0xfb, +0xbc, +0x1f, +0xbc, +0xb8, +0x93, +0x60, +0x24, +0xd9, +0xa7, +0x6f, +0xa0, +0xe3, +0x5d, +0x08, +0x0b, +0x09, +0x80, +0xfd, +0xda, +0x5d, +0xbb, +0x12, +0x47, +0x7e, +0xad, +0x44, +0x8b, +0x6c, +0xd7, +0x58, +0xaf, +0xad, +0x9b, +0xb4, +0x6b, +0xe3, +0xaf, +0xfe, +0x4b, +0xd3, +0x08, +0xfd, +0x57, +0x65, +0x33, +0x1e, +0xa7, +0xe0, +0xfa, +0x2e, +0x2e, +0xc7, +0x0d, +0x90, +0x1d, +0xe3, +0x22, +0x1d, +0x80, +0x9c, +0xc7, +0x48, +0x2e, +0xeb, +0x2c, +0x07, +0xe0, +0xfa, +0xef, +0x9c, +0x02, +0xc6, +0x88, +0xe3, +0x44, +0x4d, +0xa3, +0xc0, +0x76, +0xeb, +0xe0, +0x27, +0xb5, +0xe4, +0xeb, +0xd3, +0x12, +0x06, +0xc7, +0xf7, +0x3f, +0x0f, +0x45, +0x57, +0x57, +0x49, +0xd5, +0xe9, +0xc1, +0xc0, +0x4c, +0xdf, +0x3b, +0xc1, +0x65, +0x2d, +0x35, +0x6b, +0x76, +0x40, +0xa5, +0x21, +0xeb, +0xb1, +0xc3, +0x6e, +0x2c, +0xf0, +0x46, +0x7e, +0x58, +0x2a, +0x0f, +0xcc, +0xe1, +0xad, +0xd0, +0x1d, +0x5f, +0x91, +0x10, +0x73, +0xfa, +0x70, +0xe0, +0xac, +0x04, +0x5e, +0x1c, +0x32, +0xb8, +0xea, +0xc2, +0x97, +0xce, +0xab, +0xa3, +0x5f, +0x9b, +0x76, +0x15, +0x47, +0x36, +0x55, +0xb2, +0x81, +0x04, +0xa8, +0x4f, +0x3e, +0x0f, +0x5f, +0xb6, +0x81, +0x35, +0x43, +0xfa, +0xcc, +0x9b, +0x61, +0x6d, +0xd9, +0x6b, +0x04, +0xda, +0x94, +0x56, +0x83, +0x3d, +0x6a, +0xe2, +0xb2, +0x49, +0x0d, +0x48, +0x02, +0xda, +0xa5, +0x42, +0x61, +0xf4, +0x6a, +0xec, +0x7e, +0xf5, +0x7d, +0x40, +0x0d, +0x74, +0x7e, +0x28, +0x01, +0x80, +0x05, +0xa2, +0xc3, +0x40, +0x2a, +0x6d, +0xaa, +0x31, +0xb6, +0xc7, +0x03, +0x49, +0xec, +0x50, +0x09, +0x69, +0x30, +0xa6, +0x18, +0x70, +0x71, +0x2d, +0x5f, +0x8f, +0xb6, +0x60, +0x70, +0x1e, +0x97, +0x4d, +0xef, +0x87, +0x2a, +0x3a, +0x5d, +0xaf, +0x08, +0xc6, +0x76, +0x5b, +0x93, +0x8f, +0x21, +0x3c, +0xa9, +0x49, +0x7d, +0xa5, +0xfe, +0xee, +0xaf, +0xd0, +0x67, +0x54, +0x2f, +0x6b, +0xaf, +0x2e, +0xbc, +0x1b, +0xac, +0xc6, +0xaf, +0xa8, +0xda, +0x49, +0x83, +0xc3, +0xbb, +0x61, +0xc0, +0xe2, +0x4b, +0xeb, +0x73, 
+0xfd, +0x3e, +0xb4, +0x9d, +0xa8, +0xcc, +0xd7, +0x5d, +0x82, +0x40, +0xde, +0xc4, +0x82, +0x2c, +0x7f, +0xf1, +0x07, +0x1a, +0x9d, +0xc6, +0x9c, +0xd4, +0x49, +0xde, +0xde, +0x74, +0x17, +0xe5, +0xcd, +0x2a, +0x2f, +0xdc, +0xc7, +0x3e, +0x9b, +0xdd, +0xc7, +0x20, +0xfd, +0xab, +0x58, +0x16, +0xfe, +0xc0, +0x46, +0x2a, +0xa3, +0x0a, +0x01, +0x38, +0x15, +0xa0, +0xfc, +0xd4, +0x35, +0x81, +0xb3, +0xfb, +0x2f, +0x40, +0x18, +0x9b, +0xf1, +0x17, +0x23, +0xd8, +0x12, +0x35, +0x85, +0xc1, +0x53, +0xca, +0xd5, +0x1c, +0x3a, +0x92, +0x9f, +0x1b, +0x4c, +0x1d, +0x5a, +0xe9, +0x42, +0x5c, +0x20, +0xe4, +0xdb, +0xdb, +0x8a, +0xed, +0xaa, +0xb2, +0x89, +0x1e, +0x01, +0x3d, +0x8f, +0x16, +0x41, +0xaf, +0x96, +0x2f, +0x3d, +0xbf, +0xfe, +0xc3, +0x2c, +0x4b, +0xd4, +0xad, +0xf6, +0xb1, +0xc5, +0x22, +0x2a, +0xaa, +0x8c, +0x47, +0xa3, +0xf3, +0xe1, +0x3c, +0x35, +0x15, +0x34, +0x17, +0x84, +0x2b, +0x56, +0x0a, +0xfe, +0x18, +0x64, +0xfa, +0xe9, +0xea, +0xde, +0x39, +0xfc, +0x3a, +0x82, +0xdf, +0x0c, +0x3f, +0x17, +0x24, +0x9f, +0x36, +0x82, +0xef, +0x71, +0xbd, +0x2d, +0x5d, +0xe8, +0x6f, +0x78, +0x07, +0x37, +0x5c, +0xb2, +0x31, +0x42, +0x0a, +0x18, +0x62, +0x1d, +0x06, +0x27, +0x9a, +0x09, +0xa1, +0x6b, +0xad, +0xb7, +0x33, +0x07, +0xc0, +0x1a, +0x7d, +0xc1, +0x75, +0xba, +0x5e, +0xdd, +0xe0, +0x3a, +0x49, +0xd1, +0x2d, +0x29, +0x1d, +0xd9, +0xef, +0xe7, +0x69, +0xe9, +0x41, +0x10, +0xb1, +0x1f, +0x95, +0x34, +0xd6, +0xa4, +0x1b, +0x1b, +0x36, +0x24, +0xb3, +0x55, +0xc5, +0x3b, +0x85, +0x7a, +0xc9, +0x30, +0x86, +0x0d, +0x28, +0x15, +0x1c, +0x9f, +0x3e, +0x56, +0x12, +0x68, +0xf0, +0xd6, +0x6d, +0x4b, +0x2b, +0xa5, +0xd5, +0xc0, +0x41, +0x90, +0x01, +0xc4, +0x30, +0x66, +0x8c, +0x47, +0x7f, +0x62, +0xa3, +0xd0, +0x70, +0x63, +0x5f, +0x47, +0x27, +0x85, +0x22, +0xa1, +0xaf, +0x02, +0x51, +0x76, +0x73, +0x07, +0x38, +0xe7, +0x97, +0xcc, +0xa2, +0xba, +0x8c, +0x05, +0x7c, +0xe7, +0xd1, +0xaf, +0xe3, +0x88, +0x00, +0xac, +0xf8, +0x03, +0x6c, +0xc9, +0x80, +0x16, +0x39, +0x48, 
+0x29, +0x3c, +0xf8, +0xe0, +0x27, +0xed, +0x24, +0x2b, +0x4c, +0x3c, +0x1c, +0x2e, +0x07, +0x63, +0xca, +0x0b, +0x10, +0xc3, +0xcc, +0xbf, +0x49, +0x47, +0x31, +0x5d, +0xd0, +0x11, +0x86, +0xc8, +0x9f, +0xa5, +0x11, +0xe4, +0x02, +0x20, +0x30, +0xf1, +0x83, +0x31, +0x88, +0xf0, +0x07, +0x13, +0x21, +0xb3, +0x4b, +0x0d, +0x01, +0xf0, +0x35, +0x77, +0x92, +0x63, +0xad, +0x1f, +0x9a, +0x68, +0x03, +0xcf, +0xfe, +0x7b, +0x05, +0x29, +0xde, +0x66, +0x39, +0xa3, +0x0b, +0xf1, +0xe2, +0x98, +0xcb, +0x01, +0x30, +0x32, +0x20, +0x57, +0x03, +0xb1, +0x41, +0xb0, +0xee, +0xff, +0xad, +0x7c, +0x4a, +0x60, +0xf3, +0xe1, +0x30, +0xf1, +0xea, +0x41, +0x9f, +0xac, +0x63, +0x58, +0x3a, +0xf1, +0xf0, +0x03, +0xa1, +0x3e, +0x1a, +0x2a, +0x2d, +0x40, +0xa5, +0x29, +0x50, +0xb5, +0xf5, +0xe6, +0xa9, +0x7f, +0x2c, +0xf1, +0x09, +0x44, +0xee, +0xe0, +0x12, +0xbf, +0x6b, +0x5d, +0x38, +0x36, +0x73, +0x7b, +0xad, +0x82, +0x74, +0x04, +0xaf, +0xd1, +0x0a, +0x37, +0xa0, +0x3b, +0x5d, +0x88, +0x08, +0xcd, +0xfe, +0x86, +0x45, +0x31, +0x6a, +0xe8, +0x8b, +0xfa, +0xfa, +0xd8, +0x54, +0x8d, +0x04, +0x36, +0xf0, +0x51, +0x1a, +0xa6, +0xb7, +0x8d, +0xea, +0x54, +0xa8, +0xd7, +0x86, +0xf1, +0x3b, +0x5e, +0x1f, +0xd4, +0x63, +0x15, +0x28, +0xe4, +0x04, +0x25, +0x34, +0x92, +0x0e, +0x2e, +0xe8, +0x88, +0x2a, +0x40, +0x55, +0x25, +0x8f, +0x5c, +0x4a, +0x4c, +0x64, +0x29, +0xe6, +0xa3, +0xc4, +0x44, +0x02, +0xfc, +0xa3, +0xc4, +0x37, +0xfe, +0x77, +0xc1, +0xf3, +0x03, +0x9b, +0xb1, +0x3f, +0xfe, +0x05, +0x89, +0xa9, +0xaa, +0x51, +0xd1, +0x82, +0x20, +0x5e, +0x7e, +0xa6, +0xf8, +0x92, +0x60, +0x60, +0xab, +0x85, +0x24, +0xd6, +0xe5, +0xd0, +0x16, +0x22, +0xcf, +0xc3, +0x25, +0xdc, +0xaa, +0x5a, +0x46, +0x02, +0xc6, +0x71, +0x1e, +0x01, +0xde, +0xf7, +0x56, +0x0d, +0x4b, +0x60, +0x6a, +0xfd, +0x08, +0x6c, +0x36, +0x5b, +0x02, +0xb8, +0xd6, +0x92, +0xf1, +0x4f, +0x2b, +0xd8, +0x6c, +0x26, +0xf1, +0x73, +0xdf, +0x4b, +0xe8, +0x3b, +0x32, +0x73, +0xd7, +0xe1, +0x77, +0x0e, +0x4f, +0x44, +0x26, +0x60, 
+0x43, +0x08, +0x67, +0x3c, +0x01, +0x2b, +0x97, +0x4a, +0x4e, +0x0d, +0x7a, +0x8d, +0x3a, +0xf5, +0xb8, +0xdb, +0x87, +0xb1, +0xa7, +0xec, +0x49, +0x8d, +0x1a, +0xe9, +0x3b, +0x05, +0x7a, +0x47, +0xff, +0x68, +0x5d, +0x98, +0x1a, +0xd0, +0xec, +0xde, +0x89, +0x23, +0x82, +0xc9, +0x23, +0x26, +0x1f, +0x97, +0xbb, +0x4f, +0xfe, +0xc6, +0xdc, +0x07, +0xe0, +0x6b, +0x3d, +0x0e, +0x1c, +0xc7, +0x2e, +0x4c, +0x01, +0x0b, +0xe2, +0x14, +0x8f, +0x63, +0x2c, +0x03, +0x98, +0xa8, +0x46, +0x29, +0x3f, +0x65, +0x9a, +0xa2, +0xe3, +0x95, +0x91, +0xe4, +0x23, +0x16, +0xfd, +0x5d, +0x9f, +0xde, +0x1c, +0xdc, +0x60, +0x9a, +0xba, +0x1c, +0x6f, +0x07, +0xa6, +0xac, +0x30, +0xe8, +0x6e, +0x71, +0xb0, +0xc3, +0x1a, +0x92, +0xe8, +0x4a, +0x4c, +0x3c, +0x8c, +0x77, +0xcb, +0xf3, +0x4a, +0x14, +0x21, +0x17, +0xbb, +0x73, +0x38, +0x7f, +0xa2, +0x0a, +0xcc, +0xd6, +0xd7, +0x23, +0x07, +0x1e, +0x38, +0xca, +0x32, +0xd4, +0xfb, +0xb6, +0x09, +0x5a, +0x03, +0x77, +0x54, +0xdd, +0xd8, +0x9a, +0xab, +0x8f, +0x51, +0x54, +0xa3, +0x14, +0x18, +0xc0, +0x86, +0xe4, +0xb8, +0x66, +0xc0, +0x6d, +0x8b, +0xa9, +0x40, +0xb3, +0xf7, +0x0e, +0xd3, +0x27, +0xe9, +0x96, +0xa4, +0xeb, +0x0b, +0x7b, +0x0e, +0x74, +0x9d, +0x28, +0xb9, +0x6b, +0xfe, +0xcd, +0x0b, +0x01, +0xf4, +0xad, +0x57, +0x5b, +0xc5, +0xad, +0xc9, +0xba, +0xf7, +0x83, +0x13, +0xa9, +0xf4, +0x85, +0x9a, +0x2a, +0x6a, +0x8c, +0xc4, +0x24, +0xfc, +0x4b, +0x70, +0xca, +0xc7, +0x58, +0xb8, +0x3b, +0xa1, +0xd0, +0x2c, +0xea, +0xe9, +0xe1, +0x50, +0x6a, +0x60, +0x2d, +0xd5, +0xa5, +0xe4, +0xce, +0xcd, +0xb8, +0xf3, +0x42, +0x43, +0xc9, +0x05, +0x97, +0xea, +0xfc, +0x36, +0xba, +0x04, +0x14, +0x86, +0x75, +0x2b, +0xc6, +0x56, +0x3f, +0x88, +0x45, +0xfa, +0x1b, +0xb5, +0x45, +0x8a, +0xa0, +0x0b, +0x50, +0x80, +0x58, +0x1f, +0x9d, +0x54, +0xe6, +0x87, +0x30, +0x6e, +0x0d, +0x1b, +0xe6, +0x91, +0xa5, +0xf7, +0x7a, +0x7e, +0x73, +0x47, +0x23, +0x92, +0xe1, +0x9b, +0xd4, +0xa5, +0x1e, +0x08, +0x17, +0x4c, +0xc0, +0x74, +0xe3, +0x45, +0x8d, +0x25, 
+0x74, +0x66, +0x8b, +0x76, +0x1a, +0x0a, +0x49, +0x1a, +0xbd, +0x65, +0xc9, +0x49, +0xea, +0x10, +0x5e, +0xe7, +0x5b, +0xf0, +0x53, +0x76, +0x08, +0x5b, +0xcd, +0x42, +0xc4, +0xb5, +0x1d, +0x55, +0x01, +0x46, +0x73, +0xdb, +0x6b, +0xee, +0xc7, +0xd4, +0xdf, +0x61, +0x61, +0xb0, +0x71, +0x84, +0xa6, +0xa8, +0x86, +0xa3, +0x35, +0xd8, +0x09, +0x35, +0x31, +0x4c, +0xa2, +0xb2, +0x0f, +0x1b, +0xd5, +0x88, +0x11, +0x5b, +0x69, +0xe8, +0x5f, +0x1b, +0xe3, +0xa5, +0xa8, +0x82, +0xc2, +0x16, +0xea, +0xc8, +0x33, +0x0e, +0xcc, +0xb7, +0x1a, +0x34, +0x61, +0xdf, +0x57, +0xdb, +0x46, +0xbb, +0xcc, +0x98, +0xa3, +0xda, +0x7e, +0xfe, +0xa7, +0x58, +0x3d, +0x5a, +0xe5, +0x16, +0xd9, +0xc7, +0xfc, +0x0f, +0x46, +0x23, +0x60, +0x04, +0x9c, +0xd5, +0x11, +0x43, +0xc0, +0x6a, +0xf3, +0x35, +0x7a, +0x84, +0xa7, +0x91, +0x1d, +0xd6, +0x91, +0xe3, +0xfb, +0x73, +0x88, +0xcf, +0x9a, +0xd4, +0x5e, +0xad, +0xb3, +0xdd, +0x2b, +0xf1, +0x65, +0x61, +0xe0, +0xfe, +0x00, +0x44, +0x76, +0x0d, +0xd9, +0xfc, +0x03, +0x75, +0x37, +0x81, +0x91, +0x5c, +0x3c, +0xfc, +0x63, +0x90, +0x48, +0xef, +0x14, +0xf8, +0xd7, +0xdf, +0xf4, +0x6c, +0x13, +0x20, +0xd9, +0x3e, +0x2a, +0x40, +0x2c, +0x49, +0x3d, +0x5e, +0xdd, +0xf8, +0x11, +0x84, +0x7c, +0x3d, +0x1a, +0x5e, +0xc5, +0x7d, +0x32, +0xa5, +0xab, +0xd4, +0x69, +0x08, +0x86, +0xa8, +0x74, +0x74, +0x25, +0x68, +0x4b, +0x24, +0x40, +0x4d, +0xcb, +0xf7, +0x05, +0x09, +0xc2, +0x39, +0xa9, +0xdd, +0x00, +0xc8, +0x94, +0x0a, +0x5d, +0x13, +0x8d, +0xa5, +0xaa, +0xff, +0x29, +0x36, +0xfc, +0x58, +0x6d, +0x6a, +0x7e, +0xab, +0xf0, +0x86, +0x6f, +0xc2, +0x45, +0x87, +0xa7, +0xb8, +0xb5, +0x16, +0x9d, +0xea, +0x2a, +0x25, +0xb0, +0x60, +0xe9, +0xb5, +0x06, +0x17, +0x62, +0x64, +0xf7, +0xcb, +0x84, +0x7c, +0xb6, +0xa0, +0x26 +}; + +uint8_t pIn_0 [] = { +0x05, +0xaa, +0xc9, +0x2c, +0x10, +0x4f, +0x05, +0x8a, +0x9b, +0xbf, +0xc9, +0xed, +0x5a, +0x84, +0x4a, +0x85, +0x1d, +0x18, +0xe1, +0x88, +0xa4, +0x4c, +0x33, +0x9c, +0x65, +0xf3, +0x2f, +0x58, +0x0a, +0xf6, 
+0xa5, +0xcd, +0x5e, +0x01, +0x33, +0xe6, +0x5c, +0xab, +0x9d, +0x09, +0xc4, +0x7d, +0x95, +0x57, +0xeb, +0x3d, +0x01, +0x17, +0x86, +0x8a, +0xbd, +0x1d, +0x97, +0x84, +0x75, +0x46, +0xa7, +0xfe, +0xf6, +0x69, +0xca, +0x41, +0xc2, +0xdc +}; + +uint32_t pThr_0 [] = { +0xff63ff84, +0xff7dff97, +0xffa5ffbf, +0x00350065, +0xff6dff7a, +0xff5fffa1, +0x001f0020, +0x000d0051, +0x003c004d, +0xff6fffa3, +0x007800cf, +0x00240061, +0x0025003e, +0x00150040, +0xffc6fff4, +0x00890099, +0xff51ff92, +0x00620063, +0xffda0023, +0x001e004e, +0x00550089, +0xff5cff9e, +0xffd4fff9, +0x0039003d, +0xff99ffa4, +0x000c0038, +0xffc2fff1, +0x003c005e, +0x0079009a, +0xff8bffc3, +0xffe7001e, +0x00560060, +0xff5eff80, +0x009800ae, +0xff6eff93, +0x00930094, +0x009800ef, +0xffcc0008, +0x000c0031, +0x00430095 +}; + + +/* -----------EXPECTED OUTPUTS----------- */ +uint8_t exp_outp_0 [] = { +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xbc, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xdf, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x63, +0x53, +0xf4, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xbc, +0x47, +0x77, +0x7f, +0xd0, +0x4f, +0x6f, +0x53, +0xf4 +}; + + + +L1_DATA uint32_t threshs_l1 [40] = {0}; +L1_DATA int32_t kappa_l1 [40] = {0}; +L1_DATA int32_t lambda_l1 [40] = {0}; +L1_DATA uint8_t inp_l1 [64] = {0}; +L1_DATA uint8_t outp_l1 [128] = {0}; +L1_DATA int8_t wt_l1 [1440] = {0}; 
+L1_DATA uint8_t im2col_l1 [IM2COL_DIM] = {0}; + +#endif diff --git a/rt_nn_tests/xptnn_conv/pulp_nn_kernels.h b/rt_nn_tests/xptnn_conv/pulp_nn_kernels.h new file mode 100644 index 0000000..f4ec977 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/pulp_nn_kernels.h @@ -0,0 +1,26528 @@ +/* + * pulp_nn_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_KERNELS__ +#define __XPULPNN_KERNELS__ + +void xpulp_nn_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8_4x4( + int8_t *pIn, + int8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t 
ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4( + 
int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4( + int8_t *pIn, + int8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, 
+ uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + 
int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+void xpulp_nn_conv_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
/* Prototypes for the xpulp_nn_conv_{in}_{out}_{weights}[_4x4] kernel family; this span declares u4_i2_i2_4x4 through u2_i8_i2. Every variant takes the same 25 parameters and returns void; only pointer signedness differs: a 'u' or 'i' in the first name field pairs with uint8_t or int8_t pIn and pIm2ColBuffer, a 'u' or 'i' in the second field pairs with uint8_t or int8_t pOut, and the third field is always 'i' with int8_t pWeight. NOTE(review): the digits presumably give the element bit-width of input, output and weights; pKappa/pLambda look like batch-norm parameters gated by flag_batchnorm; the _4x4 suffix (identical signature) presumably selects a different inner-kernel unrolling. None of this is visible in the declarations - confirm against the kernel sources. */ +void xpulp_nn_conv_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
/* Prototypes for the xpulp_nn_conv_{in}_{out}_{weights}[_4x4] kernel family; this span declares u2_i8_i2_4x4 through u2_i4_i2. Every variant takes the same 25 parameters and returns void; only pointer signedness differs: a 'u' or 'i' in the first name field pairs with uint8_t or int8_t pIn and pIm2ColBuffer, a 'u' or 'i' in the second field pairs with uint8_t or int8_t pOut, and the third field is always 'i' with int8_t pWeight. NOTE(review): the digits presumably give the element bit-width of input, output and weights; pKappa/pLambda look like batch-norm parameters gated by flag_batchnorm; the _4x4 suffix (identical signature) presumably selects a different inner-kernel unrolling. None of this is visible in the declarations - confirm against the kernel sources. */ +void xpulp_nn_conv_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+void xpulp_nn_conv_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+void xpulp_nn_conv_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif + + +void xpulp_nn_conv_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8( + 
int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, 
+ uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, + 
int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); 
+ +void xpulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + 
int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8( + uint8_t *pIn, 
+ int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, 
+ uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2( + int8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +uint8_t *xpulp_nn_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i4( + int8_t 
*pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, 
+ uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i8( 
+ int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t 
*pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, + int8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void 
xpulp_nn_linear_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_linear_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_linear_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_maxpool_u8( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i8( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u4( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i4( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u2( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t 
* pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, 
+ uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + 
uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * 
pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + 
int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + 
uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t 
out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + 
int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + 
uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t 
out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * 
pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + 
int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t 
in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + + + +#endif diff --git a/rt_nn_tests/xptnn_conv/pulp_nn_mix_kernels.h b/rt_nn_tests/xptnn_conv/pulp_nn_mix_kernels.h new file mode 100644 index 0000000..8b2a3c6 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/pulp_nn_mix_kernels.h @@ -0,0 +1,7093 @@ +/* + * pulp_nn_mix_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_MIXED_KERNELS__ +#define __XPULPNN_MIXED_KERNELS__ + +void xpulp_nn_mix_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_mix_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + 
uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+void xpulp_nn_mix_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + 
uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_mix_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, 
+ uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * 
pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, 
+ uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + 
uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, 
+ int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + 
uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t 
out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * 
pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, 
+ int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t 
in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_conv/pulp_nn_utils.h b/rt_nn_tests/xptnn_conv/pulp_nn_utils.h new file mode 100644 index 0000000..44d2251 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/pulp_nn_utils.h @@ -0,0 +1,2079 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include + +typedef signed short v2s __attribute__((vector_size (4))); + + + +#define min(a,b) ((a)<(b)?(a):(b)) +#define log2(x) __builtin_pulp_fl1(x) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define CHANS_DECOMPR(x) (5*x >> 2) // equivalent to division by 0.8 + +/* Functions for Compressed MAC */ +#define CompressedMAC(sum, ptr, config) asm volatile( \ + "pv.smlsdotsp.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define CompressedMACUnsigned(sum, ptr, config) asm volatile( \ + "pv.smlsdotsup.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define InitNNRF(ptr, config) asm volatile( \ + "pv.smlsdotsp.t x0, %[phtr], %[chonfig];" \ + : [phtr] "+r" (ptr) : [chonfig] "I" (config)) + +#define ThresholdCompress(res, val, thrs) asm volatile( \ + "pv.thrc %[rhes], %[vhal], %[thhrs];" : [rhes] "+r" (res) : [vhal] "r" (val), [thhrs] "r" (thrs)) + +#define GetConfig(a_update, b_update, a_reg, b_reg) a_update << 4 | b_update << 3 | a_reg << 1 | b_reg + +/* Functions for threshold&compress */ +#define check_store(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; \ + incr_val=ch_out_r; } + +#define check_store_4x1(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; } + +#define reset_currThr() \ + if ((uint32_t *) currThr == (uint32_t *) (pThr + ch_out)) { \ + currThr = (v2s *) pThr; \ + } + +#define MacLoads20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define MacLoad20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +/* Functions for compressed min/max */ +#define CompressedMax(res, in1, in2) asm volatile( \ + "pv.max.t 
%[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define CompressedMin(res, in1, in2) asm volatile( \ + "pv.min.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Max16(res, in1, in2) asm volatile( \ + "pv.max.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Min16(res, in1, in2) asm volatile( \ + "pv.min.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define thr_cmp(state, val, threshs) __builtin_pulp_thresh_compr(state, val, threshs) + +typedef unsigned char v4u __attribute__((vector_size (4))); +typedef signed char v4s __attribute__((vector_size (4))); + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#ifdef __clang__ +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_binsert(dst,not_mask_imm,src,mask_imm,off) +#else +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#endif +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define maxs20(a, b) __builtin_pulp_max20(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define mins20(a, 
b) __builtin_pulp_min20(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) 
+#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + 
int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) 
>> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input 
<= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static 
v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t 
*pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + 
pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = 
pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); 
+ + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void 
__attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { 
+ inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); 
+ pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = 
max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + 
pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; 
+ left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int 
size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_tnn_zero_mem_ternary(uint8_t * pBuffer, unsigned int size, unsigned int uns) +{ + uint8_t pad_val = 0xd9; + uint32_t pad_vec = 0xd9d9d9d9; + if (uns) { + // if we are using an unsigned kernel, we need to pad with -1 because the hardware will add a +1 to ALL values! + pad_val = 0xff; + pad_vec = 0xffffffff; + } + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u)pad_vec; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=pad_val; + lfover-=4; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, 
inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void 
__attribute__((noinline)) xpulp_tnn_compare_and_replace_if_larger_ternary(int8_t * base, + int8_t * target, + uint16_t length) +{ + uint8_t mask2 = 0x0c; + uint8_t n_mask2 = ~ mask2; + uint8_t mask4 = 0x30; + uint8_t n_mask4 = ~ mask4; + uint8_t mask6 = 0xc0; + uint8_t n_mask6 = ~ mask6; + uint8_t off2 = 2; + uint8_t off4 = 4; + uint8_t off6 = 6; + + uint8_t *pIn = (uint8_t *) base; + uint8_t *pCom = (uint8_t *) target; + uint8_t *out; + + int cnt = length >> 2; + uint32_t result; + + while(cnt > 0u) + { + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + *((uint32_t *)pIn) = result; + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + if (left>0u) + { + // do the vector max on the whole word - we won't use the leftover bytes + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + + // ...and copy back the relevant bytes of the result to pIn + for (int i=0; i> (8*i)); + + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned 
int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +#endif diff --git a/rt_nn_tests/xptnn_conv/pulp_nn_utils_xpnn.h b/rt_nn_tests/xptnn_conv/pulp_nn_utils_xpnn.h new file mode 100644 index 0000000..0c783ae --- /dev/null +++ b/rt_nn_tests/xptnn_conv/pulp_nn_utils_xpnn.h @@ -0,0 +1,1937 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" +#ifdef GAP_SDK +#include "pulp.h" +#endif + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) 
+#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + 
int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = 
(integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + 
int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = 
(uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) 
out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + 
+ pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) 
bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 
2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); 
+ bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 
4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length 
>> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + 
*((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) 
inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) 
bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) 
bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + 
while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, 
(int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = 
bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +#endif diff --git a/rt_nn_tests/xptnn_conv/test.c b/rt_nn_tests/xptnn_conv/test.c new file mode 100644 index 0000000..ea71523 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/test.c @@ -0,0 +1,224 @@ +#include +#include +#include + + +#include "xpulp_tnn_matmul_ternary.h" +#include "xpulp_tnn_matmul_ternary_4x1.h" +#include "xpulp_tnn_conv_ternary.h" +#include "xpulp_tnn_conv_ternary_signed.h" +#ifndef PROFILE // HACKYTIME +#include "xpulp_tnn_conv1d_ternary.h" +#include "xpulp_tnn_conv1d_ternary_signed.h" +#endif +#include "xpulp_tnn_maxpool_ternary.h" +#include "xpulp_tnn_linear_ternary_i32_signed.h" +#include "xpulp_tnn_linear_ternary_i32.h" +#include "pmsis.h" + +#include "data_statstest.h" +//#include "pulp_nn_kernels.h" +#include "pulp_nn_mix_kernels.h" + + +#define start_cycle_counter() asm volatile("csrw 0xCC0, 0x01;") +#define stop_cycle_counter() asm volatile("csrw 0xCC0, 0x00;") +#define read_cycle_counter(x) asm volatile("csrr %0, 0x780;" : "=r" (x)) +#define reset_cycle_counter() asm volatile("csrw 0x780, 0x0;") + +uint8_t im2col[IM2COL_DIM] = {0}; +uint8_t outputs[OUTPUT_DIM] = {0}; + +int32_t outputs_fp[OUTPUT_DIM_FP] = {0}; + +#ifndef PROFILE +int num_cycles; +#else +int im2col_cycles; +int hotloop_prep_cycles; +int hotloop_cycles; +int threshold_cycles; +int requant_cycles; +int hotloop_leftover_cycles; +int matmul4x2_leftover_cycles; +#endif + +void call_krnl_0(); +void test_0(); + +int main(int argc, char *argv[]) +{ +#if KRAKEN_PTEST == 1 + kraken_padframe_aon_pad_gpioa_cfg_rxe_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_trie_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); +#endif + int n_mismatches = 0; + int mismatches_tot = 0; + + + + #ifndef PROFILE + num_cycles = 0; + #endif + + if (get_core_id() == 0) { + printf("===> TEST 0: Running 
xpulp_tnn_conv_ternary_signed...\n"); + printf(" dims_in = [4, 4]\n"); + printf(" dims_kernel = [3, 3]\n"); + printf(" ch_in/out = [20, 40]\n"); + //printf(" padding_y_top = [%d]\n", padding_y_top); + //printf(" padding_y_bottom = [%d]\n", padding_y_bottom); + //printf(" padding_x_left = [%d]\n", padding_x_left); + //printf(" padding_x_right = [%d]\n", padding_x_right); + //printf(" stride_x = [%d]\n", stride_x); + //printf(" stride_y = [%d]\n", stride_y); + } + test_0(); + #ifndef PROFILE + stop_cycle_counter(); + read_cycle_counter(num_cycles); + if (get_core_id() == 0) { + printf("===> TEST 0: Finished running xpulp_tnn_conv_ternary_signed\n"); + printf("num_cycles = %d\n", num_cycles); + printf("MACs = 115200\n"); + printf("MACs/cycle = %.4f\n", 115200/num_cycles); + } + #endif + if (get_core_id() == 0) { + printf("Checking for mismatches..\n"); + n_mismatches = 0; + + for(int i=0; i < 128; i++) { + if (outputs[i] != exp_outp_0[i]){ + printf("***Mismatch in test 0 at iteration %d: Expected: %x, got: %x\n", i, exp_outp_0[i], outputs[i]); + n_mismatches++; + } + } + } + mismatches_tot += n_mismatches; + + if (get_core_id() == 0) { + printf("Got %d mismatches in %d tests\n", mismatches_tot, 1); + } + return mismatches_tot; +} + +/* call_krnl_0: point the local pointers at the L1 buffers (inp_l1, outp_l1, im2col_l1, wt_l1, threshs_l1, kappa_l1, lambda_l1) and run xpulp_tnn_conv_ternary_signed once with the constant geometry passed below: 4x4 input, 20 input channels, 4x4 output, 40 output channels, 3x3 kernel, padding 1 on all four sides, stride 1 in x and y. pBias stays NULL; pKappa and pLambda are assigned but not passed to the kernel. If KRAKEN_PTEST == 1, core 0 calls kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1) before and (24, 0) after the kernel (presumably a pad toggle used as a measurement marker -- TODO confirm). If PROFILE is defined, the cycle counters are uint32_t to match the kernel's uint32_t pointer parameters and core 0 prints them afterwards; requant_cycles is never handed to the kernel, so it always prints 0. */ +void call_krnl_0(void) { + uint8_t * pInp; + uint8_t * pIm2ColBuffer; + int8_t * pBias = NULL; + uint8_t * pOut; + int8_t * pWeight; + uint32_t * pThr; + int32_t * pKappa, * pLambda; /* the '*' binds per declarator, so pLambda needs its own; assumes lambda_l1 is a buffer like kappa_l1 -- TODO confirm */ + #ifdef PROFILE + uint32_t im2col_cycles = 0; + uint32_t hotloop_prep_cycles = 0; + uint32_t hotloop_cycles = 0; + uint32_t threshold_cycles = 0; + uint32_t requant_cycles = 0; + uint32_t hotloop_leftover_cycles = 0; + uint32_t matmul4x2_leftover_cycles = 0; + #endif + pInp = inp_l1; + + pOut = outp_l1; + pIm2ColBuffer = im2col_l1; + pWeight = wt_l1; + pThr = threshs_l1; + pKappa = kappa_l1; + pLambda = lambda_l1; +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1); + } +#endif + xpulp_tnn_conv_ternary_signed( +
pInp, + pIm2ColBuffer, + pBias, + pOut, + pWeight, + pThr, + 4, + 4, + 20, + 4, + 4, + 40, + 3, + 3, + 1, + 1, + 1, + 1, + 1, +#ifndef PROFILE + 1 +#else + 1, + &im2col_cycles, + &hotloop_prep_cycles, + &hotloop_cycles, + &threshold_cycles, + &hotloop_leftover_cycles, + &matmul4x2_leftover_cycles +#endif + ); + +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); + } +#endif + #ifdef PROFILE + if (pi_core_id() == 0) { + printf("im2col_cycles = %d\n", (int) im2col_cycles); + printf("hotloop_prep_cycles = %d\n", (int) hotloop_prep_cycles); + printf("hotloop_cycles = %d\n", (int) hotloop_cycles); + printf("requant_cycles = %d\n", (int) requant_cycles); + printf("threshold_cycles = %d\n", (int) threshold_cycles); + printf("hotloop_leftover_cycles = %d\n", (int) hotloop_leftover_cycles); + printf("matmul4x2_leftover_cycles = %d\n", (int) matmul4x2_leftover_cycles); + } + #endif +} +/* test_0: core 0 DMA-copies the test vectors into the L1 buffers (64 bytes pIn_0 / inp_l1, 40 * 4 bytes pThr_0 / threshs_l1, 1440 bytes pWeight_0 / wt_l1, last argument 1), each copy followed by plp_dma_barrier() on core 0 and pi_cl_team_barrier(0) on every core; every core then calls call_krnl_0(), and core 0 transfers 128 bytes between outputs and outp_l1 with last argument 0 to fetch the results. */ +void test_0(void) { + // DMA transfer inputs from L2 to L1 + if (pi_core_id() == 0) { + plp_dma_memcpy(pIn_0, inp_l1, 64, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + // transfer thresholds + if (pi_core_id() == 0) { + plp_dma_memcpy(pThr_0, threshs_l1, 40 * 4, 1); // 4 bytes per set of 2 thresholds + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + // transfer weights + if (pi_core_id() == 0) { + plp_dma_memcpy(pWeight_0, wt_l1, 1440, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + call_krnl_0(); + // get outputs back with DMA + if (pi_core_id() == 0) { + plp_dma_memcpy(outputs, outp_l1, 128, 0); + plp_dma_barrier(); + } +} + + + + + + + + + + diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_conv1d_ternary.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv1d_ternary.h new file mode 100644 index 0000000..f1a08a6 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv1d_ternary.h @@ -0,0 +1,35 @@ +#ifndef __XPULP_NN_CONV1D_TERNARY_H +#define __XPULP_NN_CONV1D_TERNARY_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void
__attribute__((noinline)) xpulp_tnn_conv1d_ternary( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t dilation_x); +#else + uint16_t dilation_x, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_conv1d_ternary_signed.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv1d_ternary_signed.h new file mode 100644 index 0000000..0f8ba08 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv1d_ternary_signed.h @@ -0,0 +1,35 @@ +#ifndef __XPULP_NN_CONV1D_TERNARY_SIGNED_H +#define __XPULP_NN_CONV1D_TERNARY_SIGNED_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv1d_ternary_signed( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t dilation_x); +#else + uint16_t dilation_x, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary.h new file mode 100644 index 0000000..f4a8759 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary.h @@ -0,0 +1,40 @@ +#ifndef __XPULP_NN_CONV_TERNARY_H +#define 
__XPULP_NN_CONV_TERNARY_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y); +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary_signed.c b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary_signed.c new file mode 100644 index 0000000..92ff6cd --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary_signed.c @@ -0,0 +1,259 @@ +#include +#include "pmsis.h" +#include "pulp_nn_utils.h" +#include "xpulp_tnn_conv_ternary_signed.h" +#include "xpulp_tnn_matmul_ternary_signed.h" +#include "xpulp_tnn_matmul_ternary_signed_4x1.h" +/* xpulp_tnn_conv_ternary_signed: 2D convolution on packed ternary data, split across cores by output rows. For each output pixel owned by the calling core, one im2col column is assembled in that core's slice of pIm2ColBuffer: kernel taps that fall outside the input are filled by xpulp_tnn_zero_mem_ternary, all others are copied by xpulp_nn_im2col_u2_to_u2. As soon as two columns are buffered they go to xpulp_tnn_matmul_ternary_signed (output pointers pOutBuffer and pOutBuffer + ch_out_r); a single column left at the end of a row goes to xpulp_tnn_matmul_ternary_signed_4x1. pThr, pBias and pWeight are forwarded unchanged to those kernels. Unless FC_TEST is defined the core id comes from pi_core_id() and the function ends in pi_cl_team_barrier(0). With PROFILE defined, the extra uint32_t pointers accumulate cycle counts (im2col and the 4x1 leftover path here, the rest inside the matmul kernels). */ +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary_signed( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y) +{ +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t
*threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles) +{ + uint32_t im2col_add; + uint32_t hotloop_prep_add; + uint32_t hotloop_add; + uint32_t threshold_add; + uint32_t hotloop_leftover_add; + uint32_t matmul4x2_leftover_add; +#endif + //uint16_t ch_in_r = PACK_INT2_SIZE(ch_in); + uint16_t ch_in_r = ch_in/5; /* bytes per pixel; presumably five ternary values per byte -- TODO confirm */ + uint16_t ch_out_r = ch_out/5; + uint16_t ch_in_c = ch_in_r * 4; + uint16_t ch_out_c = ch_out_r * 4; +#ifdef FC_TEST + int core_id = 0; +#else + int core_id = pi_core_id(); +#endif + uint8_t * pIm2ColBase = pIm2ColBuffer + (2 * core_id * ch_in_r * dim_kernel_x * dim_kernel_y); + int i_out_y, i_out_x, i_ker_y, i_ker_x; + int Log2Core; + /* extra_chunk: dim_out_y is not a multiple of NUM_CORES (mask test; assumes NUM_CORES is a power of two -- TODO confirm). */ + uint8_t extra_chunk = ((dim_out_y & (NUM_CORES-1)) != 0); + uint8_t extra_chunk_r; + uint16_t dim_out_x_r; + uint8_t section; + int core_id_r; + /* With a ragged row split and more than one output column, cores work in pairs: core_id >> 1 selects the row band and section = core_id & 1 selects which half of each row; otherwise every core takes whole rows. */ + if(extra_chunk && dim_out_x > 1) + { + Log2Core = log2(NUM_CORES >> 1); + core_id_r = (core_id >> 1); + dim_out_x_r = (dim_out_x >> 1); + section = (core_id & 0x1); + extra_chunk_r = ((dim_out_y & ((NUM_CORES >> 1) - 1)) != 0); + } + else + { + Log2Core = log2(NUM_CORES); + core_id_r = core_id; + dim_out_x_r = dim_out_x; + section = 0; + extra_chunk_r = extra_chunk; + extra_chunk = 0; + } + + uint8_t flag_dim_out_x_odd = dim_out_x & 0x01; + + int chunk = (dim_out_y >> Log2Core) + extra_chunk_r; + /* Output rows [start_pixel, stop_pixel) belong to this core. */ + int start_pixel = min((chunk * core_id_r), dim_out_y); + int stop_pixel = min(start_pixel + chunk, dim_out_y); + + uint8_t *pIm2Col = pIm2ColBase; + uint8_t *pOutBuffer = pOut + (start_pixel * ch_out_r * dim_out_x) + (section * ch_out_r * dim_out_x_r); + uint32_t thrc_res1 = 0, thrc_res2 = 0; /* uint32_t to match the uint32_t pointer parameters of both matmul kernels */ + + for (i_out_y = start_pixel; i_out_y < stop_pixel; i_out_y++) + { + for(i_out_x=(section * dim_out_x_r); i_out_x<(dim_out_x_r + (section * (dim_out_x_r + flag_dim_out_x_odd))); i_out_x++) + { + #ifdef PROFILE + //printf("Started im2col counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + if(i_out_y < padding_y_top) + { +
for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x=((i_out_x * stride_x) - padding_x_left); i_ker_x<((i_out_x * stride_x) - padding_x_left + dim_kernel_x); i_ker_x++) + { + if((i_ker_y < 0) || (i_ker_y >= dim_in_y) || (i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_tnn_zero_mem_ternary(pIm2Col, ch_in_c, 0); + } + else + { + xpulp_nn_im2col_u2_to_u2((uint8_t*) (pIn + ((i_ker_y * dim_in_x + i_ker_x) * ch_in_r)), pIm2Col, ch_in_c); + } + pIm2Col+=ch_in_r; + } + } + } + else if(i_out_y < dim_out_y - padding_y_bottom) + { + if(i_out_x < padding_x_left) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x=((i_out_x * stride_x) - padding_x_left); i_ker_x<((i_out_x * stride_x) - padding_x_left + dim_kernel_x); i_ker_x++) + { + if((i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_tnn_zero_mem_ternary(pIm2Col, ch_in_c, 0); + } + else + { + xpulp_nn_im2col_u2_to_u2((uint8_t*) (pIn + ((i_ker_y * dim_in_x + i_ker_x) * ch_in_r)), pIm2Col, ch_in_c); + } + pIm2Col+=ch_in_r; + } + } + } + else if(i_out_x < (dim_out_x - padding_x_right)) + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + xpulp_nn_im2col_u2_to_u2((uint8_t*) pIn + (i_ker_y * dim_in_x + i_out_x * stride_x - padding_x_left)*ch_in_r,pIm2Col,ch_in_c * dim_kernel_x); + pIm2Col+= ch_in_r * dim_kernel_x; + } + } + else + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x = i_out_x * stride_x - padding_x_left; i_ker_x < i_out_x * stride_x - padding_x_left + dim_kernel_x; i_ker_x++) + { + if((i_ker_x < 0) || (i_ker_x >= dim_in_x)) + { + xpulp_tnn_zero_mem_ternary(pIm2Col, ch_in_c, 0); + } + else + { + xpulp_nn_im2col_u2_to_u2((uint8_t *)pIn +
(i_ker_y*dim_in_x+i_ker_x)* ch_in_r, pIm2Col, ch_in_c); + } + pIm2Col += ch_in_r; + } + } + } + } + else + { + for(i_ker_y=((i_out_y * stride_y) - padding_y_top); i_ker_y<((i_out_y * stride_y) - padding_y_top + dim_kernel_y); i_ker_y++) + { + for(i_ker_x = i_out_x * stride_x - padding_x_left; i_ker_x < i_out_x * stride_x - padding_x_left + dim_kernel_x; i_ker_x++) + { + if(i_ker_y < 0 || (i_ker_y >= dim_in_y) || i_ker_x < 0 || i_ker_x >= dim_in_x) + { + xpulp_tnn_zero_mem_ternary(pIm2Col, ch_in_c, 0); + } + else + { + xpulp_nn_im2col_u2_to_u2((uint8_t *) pIn + (i_ker_y * dim_in_x + i_ker_x) * ch_in_r, pIm2Col, ch_in_c); + } + pIm2Col += ch_in_r; + } + } + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped im2col counter\n"); + read_cycle_counter(im2col_add); + *im2col_cycles += im2col_add; + #endif + if(pIm2Col == (pIm2ColBase + ((ch_in_r * dim_kernel_x * dim_kernel_y) << 1))) + { + pOutBuffer = xpulp_tnn_matmul_ternary_signed( + pIm2ColBase, + pBias, + pThr, + pOutBuffer, + pOutBuffer + ch_out_r, + pWeight, + (ch_in * dim_kernel_x * dim_kernel_y), + ch_out, + &thrc_res1, + #ifndef PROFILE + &thrc_res2 + #else + &thrc_res2, + hotloop_prep_cycles, + hotloop_cycles, + threshold_cycles, + hotloop_leftover_cycles + #endif + ); + pIm2Col = pIm2ColBase; + } + } + thrc_res1 = 0; + if(pIm2Col != pIm2ColBase) + { + #ifdef PROFILE + //printf("Started matmul4x2_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + pOutBuffer = xpulp_tnn_matmul_ternary_signed_4x1( + pIm2ColBase, + pBias, + pThr, + pOutBuffer, + pWeight, + (ch_in * dim_kernel_x * dim_kernel_y), + ch_out, +#ifndef PROFILE + &thrc_res1 +#else + &thrc_res1, + hotloop_prep_cycles, + hotloop_leftover_cycles, // everything done by the 4x1 hotloop is "hotloop leftover"!
+ threshold_cycles, + hotloop_leftover_cycles +#endif + ); + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped matmul4x2_leftover counter\n"); + read_cycle_counter(matmul4x2_leftover_add); + *matmul4x2_leftover_cycles += matmul4x2_leftover_add; + #endif + } + pOutBuffer+=(extra_chunk * ((dim_out_x_r + ((1 - section) * flag_dim_out_x_odd)) * ch_out_r)); + pIm2Col = pIm2ColBase; + } +#ifndef FC_TEST + pi_cl_team_barrier(0); +#endif +} diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary_signed.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary_signed.h new file mode 100644 index 0000000..d05c2ab --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_conv_ternary_signed.h @@ -0,0 +1,40 @@ +#ifndef __XPULP_NN_CONV_TERNARY_SIGNED_H +#define __XPULP_NN_CONV_TERNARY_SIGNED_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary_signed( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y); +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_linear_ternary_i32.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_linear_ternary_i32.h new file mode 100644 index 0000000..5c353d3 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_linear_ternary_i32.h @@ -0,0 +1,12 @@ +#ifndef __XPULP_TNN_LINEAR_TERNARY_I32_H +#define __XPULP_TNN_LINEAR_TERNARY_I32_H + +void
__attribute__((noinline)) xpulp_tnn_linear_ternary_i32( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_linear_ternary_i32_signed.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_linear_ternary_i32_signed.h new file mode 100644 index 0000000..218c5c4 --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_linear_ternary_i32_signed.h @@ -0,0 +1,12 @@ +#ifndef __XPULP_TNN_LINEAR_TERNARY_I32_SIGNED_H +#define __XPULP_TNN_LINEAR_TERNARY_I32_SIGNED_H + +void __attribute__((noinline)) xpulp_tnn_linear_ternary_i32_signed( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary.h new file mode 100644 index 0000000..914daee --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary.h @@ -0,0 +1,113 @@ +#ifndef __XPULP_TNN_MATMUL_TERNARY_H +#define __XPULP_TNN_MATMUL_TERNARY_H + +#include "pulp_nn_utils.h" + +uint8_t * __attribute__((noinline)) xpulp_tnn_matmul_ternary( + uint8_t *pIn, + int8_t *pBias, + uint32_t *pThr, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pWeight, + uint16_t num_col_im2col, + uint16_t ch_out, +#ifndef PROFILE + uint32_t *thrc_res1, + uint32_t *thrc_res2); +#else + uint32_t *thrc_res1, + uint32_t *thrc_res2, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +inline void __attribute__((aligned(4))) hotloop_4x2( + int *sum, int *sum2, int *sum3, int *sum4, int *sum5, int *sum6, int *sum7, int *sum8, + uint32_t *ptrA, uint32_t *ptrA2, uint32_t *ptrA3, uint32_t *ptrA4, + uint32_t *ptrB, uint32_t *ptrB2, int num_col_im2col_words +){ + for (int j=0; j +#include +#include "pulp_nn_utils.h" +#include 
"xpulp_tnn_matmul_ternary_signed.h" + + + +// TODO: review argument order +uint8_t * __attribute__((noinline)) xpulp_tnn_matmul_ternary_signed( + uint8_t *pIn, + int8_t *pBias, + uint32_t *pThr, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pWeight, + uint16_t num_col_im2col, + uint16_t ch_out, +#ifndef PROFILE + uint32_t *thrc_res1, + uint32_t *thrc_res2) +{ +#else + uint32_t *thrc_res1, + uint32_t *thrc_res2, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles) +{ + uint32_t hotloop_prep_add; + uint32_t hotloop_add; + uint32_t threshold_add; + uint32_t hotloop_leftover_add; +#endif + uint16_t ch_out_r = ch_out / 5; + + uint16_t num_col_im2col_w = num_col_im2col / 5; + uint16_t num_col_im2col_a = num_col_im2col_w; + uint16_t num_col_im2col_c = num_col_im2col_w << 2; + // used for hotloop calls + uint16_t num_col_im2col_words = num_col_im2col_w >> 2; + + uint8_t *pA = pWeight; + v2s *currThr = (v2s *) pThr; + + int res1, res2, incr_val; + res1 = *thrc_res1; + res2 = *thrc_res2; + incr_val = 0; + + for(int i=0; i < ch_out >> 2; i++) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + uint8_t *pB2 = (pB + num_col_im2col_a); + + uint32_t *ptrB = (uint32_t *) pB; + uint32_t *ptrB2 = (uint32_t *) pB2; + + uint8_t *pA2 = (pA + num_col_im2col_w); + uint8_t *pA3 = (pA2 + num_col_im2col_w); + uint8_t *pA4 = (pA3 + num_col_im2col_w); + + uint32_t *ptrA = (uint32_t *) pA ; + uint32_t *ptrA2 = (uint32_t *) pA2; + uint32_t *ptrA3 = (uint32_t *) pA3; + uint32_t *ptrA4 = (uint32_t *) pA4; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + ptrA3 = MacLoadInit(1, 0, 2, 0, ptrA3); + ptrA4 = MacLoadInit(1, 0, 3, 0, ptrA4); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int 
sum5 = 0; + int sum6 = 0; + int sum7 = 0; + int sum8 = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + sum2 = ((int) (*pBias++)); + sum3 = ((int) (*pBias++)); + sum4 = ((int) (*pBias++)); + + sum5 = sum; + sum6 = sum2; + sum7 = sum3; + sum8 = sum4; + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_4x2( + &sum, &sum2, &sum3, &sum4, &sum5, &sum6, &sum7, &sum8, + ptrA, ptrA2, ptrA3, ptrA4, ptrB, ptrB2, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + pA2+=loop_cnt_im2col_w; + pA3+=loop_cnt_im2col_w; + pA4+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + pB2+=loop_cnt_im2col_a; + + volatile uint32_t valA, valA2, valA3, valA4, valB, valB2; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA3; + valA4 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA4; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB2; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + 
valA2 = 0xd9 << 24 | 0xd9 << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | *(pA3 + 1) << 8 | *pA3; + valA4 = 0xd9 << 24 | 0xd9 << 16 | *(pA4 + 1) << 8 | *pA4; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | *(pB2 + 1) << 8 | *pB2; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | *(pA2 + 2) << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | *(pA3 + 2) << 16 | *(pA3 + 1) << 8 | *pA3; + valA4 = 0xd9 << 24 | *(pA4 + 2) << 16 | *(pA4 + 1) << 8 | *pA4; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | *(pB2 + 2) << 16 | *(pB2 + 1) << 8 | *pB2; + } + + uint32_t *pA_p = &valA; + uint32_t *pA2_p = &valA2; + uint32_t *pA3_p = &valA3; + uint32_t *pA4_p = &valA4; + + uint32_t *pB_p = &valB; + uint32_t *pB2_p = &valB2; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pA2_p = MacLoadInit(1, 0, 1, 0, pA2_p); + pA3_p = MacLoadInit(1, 0, 2, 0, pA3_p); + pA4_p = MacLoadInit(1, 0, 3, 0, pA4_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + pB2_p = MacLoadInit(0, 1, 0, 1, pB2_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + sum2 = MacLoads20(0, 0, 1, 0, pA2_p, sum2); + sum3 = MacLoads20(0, 0, 2, 0, pA3_p, sum3); + sum4 = MacLoads20(0, 0, 3, 0, pA4_p, sum4); + + sum5 = MacLoads20(0, 0, 0, 1, pA_p, sum5); + sum6 = MacLoads20(0, 0, 1, 1, pA2_p, sum6); + sum7 = MacLoads20(0, 0, 2, 1, pA3_p, sum7); + sum8 = MacLoads20(0, 0, 3, 1, pA4_p, sum8); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res1 = thr_cmp(res1, sum, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum5, *currThr++); + 
check_store(res2, pOut2); + reset_currThr(); + + res1 = thr_cmp(res1, sum2, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum6, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + res1 = thr_cmp(res1, sum3, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum7, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + res1 = thr_cmp(res1, sum4, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum8, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=(4*num_col_im2col_w); + } + else + { + pA+=(3*num_col_im2col_w); + } + } + + // leftover part : the hotloop above produces 4N output channels. If out_ch not divisible + // by 4, the remaining output channels are computed below + int out_ch_left = ch_out & 0x3; + + if (out_ch_left == 1) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + uint8_t *pB2 = (pB + num_col_im2col_a); + + uint32_t *ptrB = (uint32_t *) pB; + uint32_t *ptrB2 = (uint32_t *) pB2; + + uint32_t *ptrA = (uint32_t *) pA ; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + int sum = 0; + int sum2 = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + sum2 = sum; + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_1x2( + &sum, &sum2, ptrA, ptrB, ptrB2, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop 
counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + pB2+=loop_cnt_im2col_a; + + volatile uint32_t valA, valB, valB2; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB2; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | *(pB2 + 1) << 8 | *pB2; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | *(pB2 + 2) << 16 | *(pB2 + 1) << 8 | *pB2; + } + + uint32_t *pA_p = &valA; + + uint32_t *pB_p = &valB; + uint32_t *pB2_p = &valB2; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + pB2_p = MacLoadInit(0, 1, 0, 1, pB2_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + + sum2 = MacLoads20(0, 0, 0, 1, pA_p, sum2); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res1 = thr_cmp(res1, sum, 
*currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum2, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=num_col_im2col_w; + } + } + else if (out_ch_left == 2) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + uint8_t *pB2 = (pB + num_col_im2col_a); + + uint32_t *ptrB = (uint32_t *) pB; + uint32_t *ptrB2 = (uint32_t *) pB2; + + uint8_t *pA2 = (pA + num_col_im2col_w); + + uint32_t *ptrA = (uint32_t *) pA ; + uint32_t *ptrA2 = (uint32_t *) pA2; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + sum2 = ((int) (*pBias++)); + + sum3 = sum; + sum4 = sum2; + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_2x2( + &sum, &sum2, &sum3, &sum4, ptrA, ptrA2, ptrB, ptrB2, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + pA2+=loop_cnt_im2col_w; 
+ + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + pB2+=loop_cnt_im2col_a; + + volatile uint32_t valA, valA2, valB, valB2; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA2; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB2; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | *(pA2 + 1) << 8 | *pA2; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | *(pB2 + 1) << 8 | *pB2; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | *(pA2 + 2) << 16 | *(pA2 + 1) << 8 | *pA2; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | *(pB2 + 2) << 16 | *(pB2 + 1) << 8 | *pB2; + } + + uint32_t *pA_p = &valA; + uint32_t *pA2_p = &valA2; + + uint32_t *pB_p = &valB; + uint32_t *pB2_p = &valB2; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pA2_p = MacLoadInit(1, 0, 1, 0, pA2_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + pB2_p = MacLoadInit(0, 1, 0, 1, pB2_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + sum2 = MacLoads20(0, 0, 1, 0, pA2_p, sum2); + + sum3 = MacLoads20(0, 0, 0, 1, pA_p, sum3); + sum4 = MacLoads20(0, 0, 1, 1, pA2_p, sum4); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res1 = thr_cmp(res1, sum, *currThr); + 
check_store(res1, pOut); + res2 = thr_cmp(res2, sum3, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + res1 = thr_cmp(res1, sum2, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum4, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=(2*num_col_im2col_w); + } + else + { + pA+=num_col_im2col_w; + } + } + else if (out_ch_left == 3) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + uint8_t *pB2 = (pB + num_col_im2col_a); + + uint32_t *ptrB = (uint32_t *) pB; + uint32_t *ptrB2 = (uint32_t *) pB2; + + uint8_t *pA2 = (pA + num_col_im2col_w); + uint8_t *pA3 = (pA2 + num_col_im2col_w); + + uint32_t *ptrA = (uint32_t *) pA ; + uint32_t *ptrA2 = (uint32_t *) pA2; + uint32_t *ptrA3 = (uint32_t *) pA3; + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + ptrA3 = MacLoadInit(1, 0, 2, 0, ptrA3); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + ptrB2 = MacLoadInit(0, 1, 0, 1, ptrB2); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int sum5 = 0; + int sum6 = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + sum2 = ((int) (*pBias++)); + sum3 = ((int) (*pBias++)); + + sum4 = sum; + sum5 = sum2; + sum6 = sum3; + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_3x2( + &sum, &sum2, &sum3, &sum4, &sum5, &sum6, + ptrA, ptrA2, ptrA3, ptrB, ptrB2, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped 
hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + pA2+=loop_cnt_im2col_w; + pA3+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + pB2+=loop_cnt_im2col_a; + + volatile uint32_t valA, valA2, valA3, valB, valB2; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA3; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB2; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | *(pA3 + 1) << 8 | *pA3; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | 0xd9 << 16 | *(pB2 + 1) << 8 | *pB2; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | *(pA2 + 2) << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | *(pA3 + 2) << 16 | *(pA3 + 1) << 8 | *pA3; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + valB2 = 0xd9 << 24 | *(pB2 + 2) << 16 | *(pB2 + 1) << 8 | *pB2; + } + + uint32_t *pA_p = &valA; + uint32_t *pA2_p = &valA2; + uint32_t *pA3_p = &valA3; + + uint32_t *pB_p = &valB; + uint32_t *pB2_p = &valB2; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pA2_p = MacLoadInit(1, 0, 1, 0, pA2_p); + pA3_p = MacLoadInit(1, 0, 2, 
0, pA3_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + pB2_p = MacLoadInit(0, 1, 0, 1, pB2_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + sum2 = MacLoads20(0, 0, 1, 0, pA2_p, sum2); + sum3 = MacLoads20(0, 0, 2, 0, pA3_p, sum3); + + sum4 = MacLoads20(0, 0, 0, 1, pA_p, sum4); + sum5 = MacLoads20(0, 0, 1, 1, pA2_p, sum5); + sum6 = MacLoads20(0, 0, 2, 1, pA3_p, sum6); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res1 = thr_cmp(res1, sum, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum4, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + res1 = thr_cmp(res1, sum2, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum5, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + res1 = thr_cmp(res1, sum3, *currThr); + check_store(res1, pOut); + res2 = thr_cmp(res2, sum6, *currThr++); + check_store(res2, pOut2); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=(3*num_col_im2col_w); + } + else + { + pA+=(2*num_col_im2col_w); + } + } + + *thrc_res1 = res1; + *thrc_res2 = res2; + + pOut+=incr_val; // ch_out_r if a store was performed, else 0 + return pOut; +} + + + + diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary_signed.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary_signed.h new file mode 100644 index 0000000..1493b5d --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary_signed.h @@ -0,0 +1,113 @@ +#ifndef __XPULP_TNN_MATMUL_TERNARY_SIGNED_H +#define __XPULP_TNN_MATMUL_TERNARY_SIGNED_H + +#include 
"pulp_nn_utils.h" + +uint8_t * __attribute__((noinline)) xpulp_tnn_matmul_ternary_signed( + uint8_t *pIn, + int8_t *pBias, + uint32_t *pThr, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pWeight, + uint16_t num_col_im2col, + uint16_t ch_out, +#ifndef PROFILE + uint32_t *thrc_res1, + uint32_t *thrc_res2); +#else + uint32_t *thrc_res1, + uint32_t *thrc_res2, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +inline void __attribute__((aligned(4))) hotloop_4x2( + int *sum, int *sum2, int *sum3, int *sum4, int *sum5, int *sum6, int *sum7, int *sum8, + uint32_t *ptrA, uint32_t *ptrA2, uint32_t *ptrA3, uint32_t *ptrA4, + uint32_t *ptrB, uint32_t *ptrB2, int num_col_im2col_words +){ + for (int j=0; j +#include +#include "pulp_nn_utils.h" +#include "xpulp_tnn_matmul_ternary_signed_4x1.h" + + + +// TODO: review argument order +uint8_t * __attribute__((noinline)) xpulp_tnn_matmul_ternary_signed_4x1( + uint8_t *pIn, + int8_t *pBias, + uint32_t *pThr, + uint8_t *pOut, + uint8_t *pWeight, + uint16_t num_col_im2col, + uint16_t ch_out, +#ifndef PROFILE + uint32_t *thrc_res) +{ +#else + uint32_t *thrc_res, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles) +{ + uint32_t hotloop_prep_add; + uint32_t hotloop_add; + uint32_t threshold_add; + uint32_t hotloop_leftover_add; +#endif + uint16_t ch_out_r = ch_out / 5; + + uint16_t num_col_im2col_w = num_col_im2col / 5; + uint16_t num_col_im2col_a = num_col_im2col_w; + uint16_t num_col_im2col_c = num_col_im2col_w << 2; + // used for hotloop calls + uint16_t num_col_im2col_words = num_col_im2col_w >> 2; + + uint8_t *pA = pWeight; + v2s *currThr = (v2s *) pThr; + + int res; + res = *thrc_res; + + for(int i=0; i < ch_out >> 2; i++) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + + 
uint32_t *ptrB = (uint32_t *) pB; + + uint8_t *pA2 = (pA + num_col_im2col_w); + uint8_t *pA3 = (pA2 + num_col_im2col_w); + uint8_t *pA4 = (pA3 + num_col_im2col_w); + + uint32_t *ptrA = (uint32_t *) pA ; + uint32_t *ptrA2 = (uint32_t *) pA2; + uint32_t *ptrA3 = (uint32_t *) pA3; + uint32_t *ptrA4 = (uint32_t *) pA4; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + ptrA3 = MacLoadInit(1, 0, 2, 0, ptrA3); + ptrA4 = MacLoadInit(1, 0, 3, 0, ptrA4); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + sum2 = ((int) (*pBias++)); + sum3 = ((int) (*pBias++)); + sum4 = ((int) (*pBias++)); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_4x1( + &sum, &sum2, &sum3, &sum4, ptrA, ptrA2, ptrA3, ptrA4, ptrB, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + pA2+=loop_cnt_im2col_w; + pA3+=loop_cnt_im2col_w; + pA4+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + + volatile uint32_t valA, valA2, valA3, valA4, valB; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 
| 0xd9 << 16 | 0xd9 << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA3; + valA4 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA4; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | *(pA3 + 1) << 8 | *pA3; + valA4 = 0xd9 << 24 | 0xd9 << 16 | *(pA4 + 1) << 8 | *pA4; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | *(pA2 + 2) << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | *(pA3 + 2) << 16 | *(pA3 + 1) << 8 | *pA3; + valA4 = 0xd9 << 24 | *(pA4 + 2) << 16 | *(pA4 + 1) << 8 | *pA4; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + } + + uint32_t *pA_p = &valA; + uint32_t *pA2_p = &valA2; + uint32_t *pA3_p = &valA3; + uint32_t *pA4_p = &valA4; + + uint32_t *pB_p = &valB; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pA2_p = MacLoadInit(1, 0, 1, 0, pA2_p); + pA3_p = MacLoadInit(1, 0, 2, 0, pA3_p); + pA4_p = MacLoadInit(1, 0, 3, 0, pA4_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + sum2 = MacLoads20(0, 0, 1, 0, pA2_p, sum2); + sum3 = MacLoads20(0, 0, 2, 0, pA3_p, sum3); + sum4 = MacLoads20(0, 0, 3, 0, pA4_p, sum4); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res = thr_cmp(res, sum, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + res = thr_cmp(res, sum2, *currThr++); + check_store_4x1(res, 
pOut); + reset_currThr(); + + res = thr_cmp(res, sum3, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + res = thr_cmp(res, sum4, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=(4*num_col_im2col_w); + } + else + { + pA+=(3*num_col_im2col_w); + } + } + + // leftover part : the hotloop above produces 4N output channels. If out_ch not divisible + // by 4, the remaining output channels are computed below + int out_ch_left = ch_out & 0x3; + + if (out_ch_left == 1) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + + uint32_t *ptrB = (uint32_t *) pB; + + uint32_t *ptrA = (uint32_t *) pA ; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_1x1( + &sum, ptrA, ptrB, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + + volatile 
uint32_t valA, valB; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + } + + uint32_t *pA_p = &valA; + + uint32_t *pB_p = &valB; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res = thr_cmp(res, sum, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=num_col_im2col_w; + } + } + else if (out_ch_left == 2) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + + uint32_t *ptrB = (uint32_t *) pB; + + uint8_t *pA2 = (pA + num_col_im2col_w); + + uint32_t *ptrA = (uint32_t *) pA ; + uint32_t *ptrA2 = (uint32_t *) pA2; + + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + int sum2 = 0; + + if (pBias != 
NULL) + { + sum = ((int) (*pBias++)); + sum2 = ((int) (*pBias++)); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + hotloop_2x1( + &sum, &sum2, ptrA, ptrA2, ptrB, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + pA2+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + + volatile uint32_t valA, valA2, valB; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA2; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | *(pA2 + 1) << 8 | *pA2; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | *(pA2 + 2) << 16 | *(pA2 + 1) << 8 | *pA2; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + } + + uint32_t *pA_p = &valA; + uint32_t *pA2_p = &valA2; + + uint32_t *pB_p = &valB; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pA2_p = MacLoadInit(1, 0, 1, 0, pA2_p); + 
pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + + pA += PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + sum2 = MacLoads20(0, 0, 1, 0, pA2_p, sum2); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res = thr_cmp(res, sum, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + res = thr_cmp(res, sum2, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=(2*num_col_im2col_w); + } + else + { + pA+=num_col_im2col_w; + } + } + else if (out_ch_left == 3) + { + #ifdef PROFILE + //printf("Started hotloop_prep counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + uint8_t *pB = pIn; + + uint32_t *ptrB = (uint32_t *) pB; + + uint8_t *pA2 = (pA + num_col_im2col_w); + uint8_t *pA3 = (pA2 + num_col_im2col_w); + + uint32_t *ptrA = (uint32_t *) pA ; + uint32_t *ptrA2 = (uint32_t *) pA2; + uint32_t *ptrA3 = (uint32_t *) pA3; + ptrA = MacLoadInit(1, 0, 0, 0, ptrA); + ptrA2 = MacLoadInit(1, 0, 1, 0, ptrA2); + ptrA3 = MacLoadInit(1, 0, 2, 0, ptrA3); + + + ptrB = MacLoadInit(0, 1, 0, 0, ptrB); + + int sum = 0; + int sum2 = 0; + int sum3 = 0; + + if (pBias != NULL) + { + sum = ((int) (*pBias++)); + sum2 = ((int) (*pBias++)); + sum3 = ((int) (*pBias++)); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_prep counter\n"); + read_cycle_counter(hotloop_prep_add); + *hotloop_prep_cycles += hotloop_prep_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + + 
hotloop_3x1( + &sum, &sum2, &sum3, + ptrA, ptrA2, ptrA3, ptrB, num_col_im2col_words + ); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop counter\n"); + read_cycle_counter(hotloop_add); + *hotloop_cycles += hotloop_add; + #endif + + #ifdef PROFILE + //printf("Started hotloop_leftover counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + int col_cnt_im2col = num_col_im2col_c & 0xf; + + if (col_cnt_im2col) + { + uint16_t loop_cnt_im2col_w = (num_col_im2col_words) << 2; + pA+=loop_cnt_im2col_w; + pA2+=loop_cnt_im2col_w; + pA3+=loop_cnt_im2col_w; + + uint16_t loop_cnt_im2col_a = (num_col_im2col_words) << 2; + pB+=loop_cnt_im2col_a; + + volatile uint32_t valA, valA2, valA3, valB; + // pack the remaining weights and activations into 32-bit vectors + // padding with 0xd9 because ternary_decoder(0xd9) = 0000000000 + if (col_cnt_im2col == 4) + { + valA = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pA3; + + valB = 0xd9 << 24 | 0xd9 << 16 | 0xd9 << 8 | *pB; + } + else if (col_cnt_im2col == 8) + { + valA = 0xd9 << 24 | 0xd9 << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | 0xd9 << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | 0xd9 << 16 | *(pA3 + 1) << 8 | *pA3; + + valB = 0xd9 << 24 | 0xd9 << 16 | *(pB + 1) << 8 | *pB; + } + else // col_cnt_im2col == 12 + { + valA = 0xd9 << 24 | *(pA + 2) << 16 | *(pA + 1) << 8 | *pA; + valA2 = 0xd9 << 24 | *(pA2 + 2) << 16 | *(pA2 + 1) << 8 | *pA2; + valA3 = 0xd9 << 24 | *(pA3 + 2) << 16 | *(pA3 + 1) << 8 | *pA3; + + valB = 0xd9 << 24 | *(pB + 2) << 16 | *(pB + 1) << 8 | *pB; + } + + uint32_t *pA_p = &valA; + uint32_t *pA2_p = &valA2; + uint32_t *pA3_p = &valA3; + + uint32_t *pB_p = &valB; + + pA_p = MacLoadInit(1, 0, 0, 0, pA_p); + pA2_p = MacLoadInit(1, 0, 1, 0, pA2_p); + pA3_p = MacLoadInit(1, 0, 2, 0, pA3_p); + pB_p = MacLoadInit(0, 1, 0, 0, pB_p); + + pA += 
PACK_INT2_SIZE(col_cnt_im2col); + + sum = MacLoads20(0, 0, 0, 0, pA_p, sum); + sum2 = MacLoads20(0, 0, 1, 0, pA2_p, sum2); + sum3 = MacLoads20(0, 0, 2, 0, pA3_p, sum3); + } + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped hotloop_leftover counter\n"); + read_cycle_counter(hotloop_leftover_add); + *hotloop_leftover_cycles += hotloop_leftover_add; + #endif + + #ifdef PROFILE + //printf("Started threshold counter\n"); + reset_cycle_counter(); + start_cycle_counter(); + #endif + res = thr_cmp(res, sum, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + res = thr_cmp(res, sum2, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + res = thr_cmp(res, sum3, *currThr++); + check_store_4x1(res, pOut); + reset_currThr(); + + #ifdef PROFILE + stop_cycle_counter(); + //printf("Stopped threshold counter\n"); + read_cycle_counter(threshold_add); + *threshold_cycles += threshold_add; + #endif + + if (!col_cnt_im2col) + { + pA+=(3*num_col_im2col_w); + } + else + { + pA+=(2*num_col_im2col_w); + } + } + + *thrc_res = res; + + return pOut; +} + + + + diff --git a/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary_signed_4x1.h b/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary_signed_4x1.h new file mode 100644 index 0000000..a847bfb --- /dev/null +++ b/rt_nn_tests/xptnn_conv/xpulp_tnn_matmul_ternary_signed_4x1.h @@ -0,0 +1,96 @@ +#ifndef __XPULP_TNN_MATMUL_TERNARY_SIGNED_4X1_H +#define __XPULP_TNN_MATMUL_TERNARY_SIGNED_4X1_H + +#include "pulp_nn_utils.h" + +uint8_t * __attribute__((noinline)) xpulp_tnn_matmul_ternary_signed_4x1( + uint8_t *pIn, + int8_t *pBias, + uint32_t *pThr, + uint8_t *pOut, + uint8_t *pWeight, + uint16_t num_col_im2col, + uint16_t ch_out, +#ifndef PROFILE + uint32_t *thrc_res); +#else + uint32_t *thrc_res, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +inline void __attribute__((aligned(4))) hotloop_4x1( + int *sum, int *sum2, int *sum3, 
int *sum4, + uint32_t *ptrA, uint32_t *ptrA2, uint32_t *ptrA3, uint32_t *ptrA4, + uint32_t *ptrB, int num_col_im2col_words +){ + for (int j=0; j + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_KERNELS__ +#define __XPULPNN_KERNELS__ + +void xpulp_nn_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+void xpulp_nn_conv_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t 
*pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t 
ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, 
+ uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif + + +void xpulp_nn_conv_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t 
*pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + 
uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2_4x4( + 
int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + 
uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2( + int8_t *pIn, + 
int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, 
+ uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t 
stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); 
+ +void xpulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t 
dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, + 
int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + 
int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4( + int8_t *pIn, + int8_t 
*pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + 
int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4( + uint8_t *pIn, 
+ int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, 
+ uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); 
+#else + uint8_t flag_batch_norm, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +uint8_t *xpulp_nn_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i4( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t 
*pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, + int8_t 
*pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, 
+ uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i8( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t 
*pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, + int8_t 
*pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t 
num_o_neurons); + +void xpulp_nn_linear_u4_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_linear_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_linear_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_linear_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_maxpool_u8( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i8( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u4( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i4( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u2( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + 
+void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t 
dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t 
stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + 
uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, 
+ uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, 
+ uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, 
+ uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t 
padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + 
int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + 
uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t 
out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * 
pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, 
+ int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t 
in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif diff --git a/rt_nn_tests/xptnn_linear/pulp_nn_mix_kernels.h b/rt_nn_tests/xptnn_linear/pulp_nn_mix_kernels.h new file mode 100644 index 
0000000..8b2a3c6 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/pulp_nn_mix_kernels.h @@ -0,0 +1,7093 @@ +/* + * pulp_nn_mix_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_MIXED_KERNELS__ /* include guard: must be a single identifier (no '-') */ +#define __XPULPNN_MIXED_KERNELS__ + +void xpulp_nn_mix_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +
uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t 
ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t 
ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + 
uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t 
ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); 
+ +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2_4x4( + uint8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t 
*pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t 
lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * 
pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, 
+ uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * 
pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, 
+ uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + 
uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + 
uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t 
out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, 
+ int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t 
out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * 
pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + 
uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + 
uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t 
out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, 
+ int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, 
+ uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t 
out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, 
+ int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, 
+ uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t 
out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, 
+ int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_linear/pulp_nn_utils.h 
b/rt_nn_tests/xptnn_linear/pulp_nn_utils.h new file mode 100644 index 0000000..44d2251 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/pulp_nn_utils.h @@ -0,0 +1,2079 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include + +typedef signed short v2s __attribute__((vector_size (4))); + + + +#define min(a,b) ((a)<(b)?(a):(b)) +#define log2(x) __builtin_pulp_fl1(x) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define CHANS_DECOMPR(x) ((5*(x)) >> 2) // x*5/4, i.e. equivalent to division by 0.8; the argument is parenthesized so compound expressions such as a+b expand correctly + +/* Functions for Compressed MAC */ +#define CompressedMAC(sum, ptr, config) asm volatile( \ + "pv.smlsdotsp.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define CompressedMACUnsigned(sum, ptr, config) asm volatile( \ + "pv.smlsdotsup.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define InitNNRF(ptr, config) asm volatile( \ + "pv.smlsdotsp.t x0, %[phtr], %[chonfig];" \ + : [phtr] "+r" (ptr) : [chonfig] "I" (config)) + +#define ThresholdCompress(res, val, thrs) asm volatile( \ + "pv.thrc %[rhes], %[vhal], %[thhrs];" : [rhes] "+r" (res) : [vhal] "r" (val), [thhrs] "r" (thrs)) + +#define GetConfig(a_update, 
b_update, a_reg, b_reg) a_update << 4 | b_update << 3 | a_reg << 1 | b_reg + +/* Functions for threshold&compress */ +#define check_store(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; \ + incr_val=ch_out_r; } + +#define check_store_4x1(res, pOut) \ + if ((res & 0xe0000000) == 0x00000000) { \ + *pOut = res & 0xff; \ + pOut++; } + +#define reset_currThr() \ + if ((uint32_t *) currThr == (uint32_t *) (pThr + ch_out)) { \ + currThr = (v2s *) pThr; \ + } + +#define MacLoads20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define MacLoad20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +/* Functions for compressed min/max */ +#define CompressedMax(res, in1, in2) asm volatile( \ + "pv.max.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define CompressedMin(res, in1, in2) asm volatile( \ + "pv.min.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Max16(res, in1, in2) asm volatile( \ + "pv.max.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Min16(res, in1, in2) asm volatile( \ + "pv.min.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define thr_cmp(state, val, threshs) __builtin_pulp_thresh_compr(state, val, threshs) + +typedef unsigned char v4u __attribute__((vector_size (4))); +typedef signed char v4s __attribute__((vector_size (4))); + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#ifdef __clang__ +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_binsert(dst,not_mask_imm,src,mask_imm,off) +#else +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) 
+#endif +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define maxs20(a, b) __builtin_pulp_max20(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define mins20(a, b) __builtin_pulp_min20(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) 
__builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t 
integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t 
__attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t 
* pThr) +{/* Binary decision tree over pThr[0..14]: at most four "input <= pThr[k]" tests select a return code in 0..15. Presumably the thresholds are sorted in ascending order - TODO confirm against the callers. */ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} +/* 2-bit counterpart of the threshold quantizer: pThr[1] is tested first, then pThr[0] or pThr[2], giving a return code in 0..3. Presumably the three thresholds are sorted in ascending order - TODO confirm against the callers. */ +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + +/* Register variant: loads one 32-bit word from pSrc, extracts the four 4-bit fields at bit offsets 0, 4, 8 and 12 with bitext (presumably the sign-extending extract of the bitext/bitextu pair - confirm) and returns them packed as four int8 lanes. Bits 16..31 of the loaded word are not used and nothing is written to memory. */ +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} +/* Unsigned counterpart of pulp_nn_i4_to_i8_r: same four 4-bit fields (bit offsets 0, 4, 8, 12), extracted with bitextu and returned packed as four uint8 lanes. */ +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + 
+static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{/* Register variant: loads one 32-bit word from pSrc, extracts the four 2-bit fields at bit offsets 0, 2, 4 and 6 with bitext and returns them packed as four int8 lanes. Only the low byte of the loaded word is consumed. */ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} +/* Unsigned counterpart of pulp_nn_i2_to_i8_r: same four 2-bit fields (bit offsets 0, 2, 4, 6), extracted with bitextu and returned packed as four uint8 lanes. */ +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} +/* Register variant of pulp_nn_i2_to_i4: expands the eight 2-bit fields in bits 0..15 of the word at pSrc into eight 4-bit fields, two per output byte (even field in the low nibble, odd field moved by off=4 into the high nibble through bitins), and returns the four bytes packed. Fields are extracted with bitext, as in pulp_nn_i2_to_i4 and pulp_nn_i2_to_i8_r, so that negative 2-bit values keep their sign in the 4-bit result; the previous bitextu extraction zero-extended them. */ +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, 
bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = 
pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) 
bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) 
Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) 
out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = 
blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) 
= bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + 
int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t 
inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, 
n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_tnn_zero_mem_ternary(uint8_t * pBuffer, unsigned int size, unsigned int uns) +{ + uint8_t pad_val = 0xd9; + uint32_t pad_vec = 0xd9d9d9d9; + if (uns) { + // if we are using an unsigned kernel, we need to pad with -1 because the hardware will add a +1 to ALL values! 
+ pad_val = 0xff; + pad_vec = 0xffffffff; + } + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u)pad_vec; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=pad_val; + lfover-=4; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + 
uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} 
+static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_tnn_compare_and_replace_if_larger_ternary(int8_t * base, + int8_t * target, + uint16_t length) +{ + uint8_t mask2 = 0x0c; + uint8_t n_mask2 = ~ mask2; + uint8_t mask4 = 0x30; + uint8_t n_mask4 = ~ mask4; + uint8_t mask6 = 0xc0; + uint8_t n_mask6 = ~ mask6; + uint8_t off2 = 2; + uint8_t off4 = 4; + uint8_t off6 = 6; + + uint8_t *pIn = (uint8_t *) base; + uint8_t *pCom = (uint8_t *) target; + uint8_t *out; + + int cnt = length >> 2; + uint32_t result; + + while(cnt > 0u) + { + uint32_t in1 
= *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + *((uint32_t *)pIn) = result; + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + if (left>0u) + { + // do the vector max on the whole word - we won't use the leftover bytes + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + + // ...and copy back the relevant bytes of the result to pIn + for (int i=0; i> (8*i)); + + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, 
off6); + + pIn++; + pCom++; + left--; + } +} + +#endif diff --git a/rt_nn_tests/xptnn_linear/pulp_nn_utils_xpnn.h b/rt_nn_tests/xptnn_linear/pulp_nn_utils_xpnn.h new file mode 100644 index 0000000..0c783ae --- /dev/null +++ b/rt_nn_tests/xptnn_linear/pulp_nn_utils_xpnn.h @@ -0,0 +1,1937 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" +#ifdef GAP_SDK +#include "pulp.h" +#endif + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) 
+#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + 
int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = 
(integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + 
int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = 
(uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) 
out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + 
+ pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) 
bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 
2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); 
+ bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 
4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length 
>> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + 
*((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) 
inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) 
bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) 
bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + 
while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, 
(int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = 
bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +#endif diff --git a/rt_nn_tests/xptnn_linear/test.c b/rt_nn_tests/xptnn_linear/test.c new file mode 100644 index 0000000..51cc048 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/test.c @@ -0,0 +1,171 @@ +#include +#include +#include + + +#include "xpulp_tnn_matmul_ternary.h" +#include "xpulp_tnn_matmul_ternary_4x1.h" +#include "xpulp_tnn_conv_ternary.h" +#include "xpulp_tnn_conv_ternary_signed.h" +#ifndef PROFILE // HACKYTIME +#include "xpulp_tnn_conv1d_ternary.h" +#include "xpulp_tnn_conv1d_ternary_signed.h" +#endif +#include "xpulp_tnn_maxpool_ternary.h" +#include "xpulp_tnn_linear_ternary_i32_signed.h" +#include "xpulp_tnn_linear_ternary_i32.h" +#include "pmsis.h" + +#include "data_statstest.h" +//#include "pulp_nn_kernels.h" +#include "pulp_nn_mix_kernels.h" + + +#define start_cycle_counter() asm volatile("csrw 0xCC0, 0x01;") +#define stop_cycle_counter() asm volatile("csrw 0xCC0, 0x00;") +#define read_cycle_counter(x) asm volatile("csrr %0, 0x780;" : "=r" (x)) +#define reset_cycle_counter() asm volatile("csrw 0x780, 0x0;") + +uint8_t im2col[IM2COL_DIM] = {0}; +uint8_t outputs[OUTPUT_DIM] = {0}; + +int32_t outputs_fp[OUTPUT_DIM_FP] = {0}; + +#ifndef PROFILE +int num_cycles; +#else +int im2col_cycles; +int hotloop_prep_cycles; +int hotloop_cycles; +int threshold_cycles; +int requant_cycles; +int hotloop_leftover_cycles; +int matmul4x2_leftover_cycles; +#endif + +void call_krnl_0(); +void test_0(); + +int main(int argc, char *argv[]) +{ +#if KRAKEN_PTEST == 1 + kraken_padframe_aon_pad_gpioa_cfg_rxe_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_trie_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); +#endif + int n_mismatches = 0; + int mismatches_tot = 0; + + + + #ifndef PROFILE + num_cycles = 0; + #endif + + if (get_core_id() == 0) { + printf("===> TEST 0: Running 
xpulp_tnn_linear_ternary_i32_signed...\n"); + printf(" ch_in/out = [100, 100]\n"); + //printf(" padding_y_top = [%d]\n", padding_y_top); + //printf(" padding_y_bottom = [%d]\n", padding_y_bottom); + //printf(" padding_x_left = [%d]\n", padding_x_left); + //printf(" padding_x_right = [%d]\n", padding_x_right); + //printf(" stride_x = [%d]\n", stride_x); + //printf(" stride_y = [%d]\n", stride_y); + } + test_0(); + #ifndef PROFILE + stop_cycle_counter(); + read_cycle_counter(num_cycles); + if (get_core_id() == 0) { + printf("===> TEST 0: Finished running xpulp_tnn_linear_ternary_i32_signed\n"); + printf("num_cycles = %d\n", num_cycles); + printf("MACs = 10000\n"); + printf("MACs/cycle = %.4f\n", 10000/num_cycles); + } + #endif + if (get_core_id() == 0) { + printf("Checking for mismatches..\n"); + n_mismatches = 0; + + for(int i=0; i < 100; i++) { + if (outputs_fp[i] != exp_outp_0[i]){ + printf("***Mismatch in test 0 at iteration %d: Expected: %x, got: %x\n", i, exp_outp_0[i], outputs_fp[i]); + n_mismatches++; + } + } + } + mismatches_tot += n_mismatches; + + if (get_core_id() == 0) { + printf("Got %d mismatches in %d tests\n", mismatches_tot, 1); + } + return mismatches_tot; +} + + +void call_krnl_0(void) { + uint8_t * pInp; + uint8_t * pIm2ColBuffer; + int8_t * pBias = NULL; + uint8_t * pOut; + int8_t * pWeight; + uint32_t * pThr; + int32_t * pKappa, pLambda; + pInp = inp_l1; + + pOut = outp_l1; + pIm2ColBuffer = im2col_l1; + pWeight = wt_l1; + pThr = threshs_l1; + pKappa = kappa_l1; + pLambda = lambda_l1; +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1); + } +#endif + xpulp_tnn_linear_ternary_i32_signed( + pInp, + pBias, + (int32_t *) pOut, + pWeight, + 100, + 100); + +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); + } +#endif +} + +void test_0(void) { + // DMA transfer inputs from L2 to L1 + if (pi_core_id() == 0) { + plp_dma_memcpy(pIn_0, inp_l1, 
20, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + // transfer weights + if (pi_core_id() == 0) { + plp_dma_memcpy(pWeight_0, wt_l1, 2000, 1); + plp_dma_barrier(); + } + pi_cl_team_barrier(0); + call_krnl_0(); + // get outputs back with DMA + if (pi_core_id() == 0) { + plp_dma_memcpy(outputs_fp, outp_l1, 400, 0); + plp_dma_barrier(); + } +} + + + + + + + + + + diff --git a/rt_nn_tests/xptnn_linear/xpulp_tnn_conv1d_ternary.h b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv1d_ternary.h new file mode 100644 index 0000000..f1a08a6 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv1d_ternary.h @@ -0,0 +1,35 @@ +#ifndef __XPULP_NN_CONV1D_TERNARY_H +#define __XPULP_NN_CONV1D_TERNARY_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv1d_ternary( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t dilation_x); +#else + uint16_t dilation_x, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_linear/xpulp_tnn_conv1d_ternary_signed.h b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv1d_ternary_signed.h new file mode 100644 index 0000000..0f8ba08 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv1d_ternary_signed.h @@ -0,0 +1,35 @@ +#ifndef __XPULP_NN_CONV1D_TERNARY_SIGNED_H +#define __XPULP_NN_CONV1D_TERNARY_SIGNED_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv1d_ternary_signed( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t 
*pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t dilation_x); +#else + uint16_t dilation_x, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_linear/xpulp_tnn_conv_ternary.h b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv_ternary.h new file mode 100644 index 0000000..f4a8759 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv_ternary.h @@ -0,0 +1,40 @@ +#ifndef __XPULP_NN_CONV_TERNARY_H +#define __XPULP_NN_CONV_TERNARY_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y); +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_linear/xpulp_tnn_conv_ternary_signed.h b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv_ternary_signed.h new file mode 100644 index 0000000..d05c2ab --- /dev/null +++ b/rt_nn_tests/xptnn_linear/xpulp_tnn_conv_ternary_signed.h @@ -0,0 +1,40 @@ +#ifndef __XPULP_NN_CONV_TERNARY_SIGNED_H +#define __XPULP_NN_CONV_TERNARY_SIGNED_H +#include 
+#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary_signed( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y); +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_linear/xpulp_tnn_linear_ternary_i32.h b/rt_nn_tests/xptnn_linear/xpulp_tnn_linear_ternary_i32.h new file mode 100644 index 0000000..5c353d3 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/xpulp_tnn_linear_ternary_i32.h @@ -0,0 +1,12 @@ +#ifndef __XPULP_TNN_LINEAR_TERNARY_I32_H +#define __XPULP_TNN_LINEAR_TERNARY_I32_H + +void __attribute__((noinline)) xpulp_tnn_linear_ternary_i32( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_linear/xpulp_tnn_linear_ternary_i32_signed.c b/rt_nn_tests/xptnn_linear/xpulp_tnn_linear_ternary_i32_signed.c new file mode 100644 index 0000000..36aeb29 --- /dev/null +++ b/rt_nn_tests/xptnn_linear/xpulp_tnn_linear_ternary_i32_signed.c @@ -0,0 +1,95 @@ +#include "pmsis.h" +#include "pulp_nn_utils.h" +#include "xpulp_tnn_linear_ternary_i32_signed.h" + +void __attribute__((noinline)) xpulp_tnn_linear_ternary_i32_signed( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons) + +{ + uint16_t dim_vec_in = dim_vec/5; // 
number of bytes + uint16_t dim_vec_wt = dim_vec_in; + uint16_t dim_vec_words = dim_vec_in >> 2; // number of words + +#ifdef FC_TEST + int core_id = 0; +#else + int core_id = pi_core_id(); +#endif + + int Log2Core = log2(NUM_CORES); + int chunk = (num_o_neurons >> Log2Core) + ((num_o_neurons & (NUM_CORES-1))!=0); + int start = min(chunk * core_id, num_o_neurons); + int stop = min(start + chunk, num_o_neurons); + int32_t * pOutBuffer = ((int32_t *) pOut) + start; + + for(int i=start; i + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __XPULPNN_KERNELS__ +#define __XPULPNN_KERNELS__ + +void xpulp_nn_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, 
+ uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + 
uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u8_i2_4x4( 
+ uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u4_i2_4x4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, 
+ uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t 
*pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + 
uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_u2_i2_4x4( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, 
+ int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + 
uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, 
+ uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t 
*pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, 
+ uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif + + +void xpulp_nn_conv_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, 
+ uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u8_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u4_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + 
int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_conv1d_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_u2_i2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_conv1d_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t dilation_x, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t 
dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + 
int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, + 
uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + 
uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + 
uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + 
uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, 
+ int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, 
+ uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t 
ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, 
+ int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, 
+ uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t 
flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u8_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i8_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4( + int8_t *pIn, + int8_t 
*pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, 
+ uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u4_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t 
*pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + 
uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i4_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u4_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i4_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_i2_i8_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, 
+ int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + 
uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i8_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i8_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_matmul_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t 
*pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i4_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i4_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + 
+uint8_t *xpulp_nn_matmul_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i8_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i8_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4( + uint8_t *pIn, 
+ int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i4_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i4_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, +#ifndef PROFILE + uint8_t flag_batch_norm); +#else + uint8_t flag_batch_norm, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *requant_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +uint8_t *xpulp_nn_matmul_u2_u2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_u2_i2_i2_4x4( + uint8_t *pIn, + int8_t *pBias, + int8_t 
*pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_u2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_matmul_i2_i2_i2_4x4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pOut2, + int8_t *pOut3, + int8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + 
uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i4( + 
uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + 
uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t 
padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i8( + uint8_t 
*pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t 
out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, 
+ uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i4_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, 
+ uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + 
uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, 
+ uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + 
uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t 
padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void 
xpulp_nn_depthwise_i2_u4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, + int8_t *pIm2ColBuffer, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int8_t *pWtBuffer, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i8_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u4_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void 
xpulp_nn_linear_u4_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i4_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u2_i32_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_i2_i32_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +void xpulp_nn_linear_u8_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + 
uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u8_i2( + 
int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t 
*pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u8_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i8_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u4_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i4_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t 
flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u8_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i8_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u8_i2( + int8_t 
*pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i8_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u4_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i4_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u4_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i4_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i8( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + 
uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i8( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i8( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i8( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i4( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i4( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i4( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i4( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t 
num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_u2_i2( + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_u2_i2_i2( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_u2_i2( + int8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_linear_i2_i2_i2( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_vec, + uint16_t num_o_neurons, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_maxpool_u8( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i8( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u4( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i4( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_u2( + uint8_t * pIn, + uint8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_maxpool_i2( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); + +void xpulp_nn_avgpool_u8_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + 
uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + 
uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t 
padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + 
uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t 
out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t 
padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t 
out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + 
int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + 
uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t 
in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif diff --git a/rt_nn_tests/xptnn_maxpool/pulp_nn_mix_kernels.h b/rt_nn_tests/xptnn_maxpool/pulp_nn_mix_kernels.h new file mode 100644 index 0000000..8b2a3c6 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/pulp_nn_mix_kernels.h @@ -0,0 +1,7093 @@ +/* + * pulp_nn_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __XPULPNN_MIXED_KERNELS__ /* include guard: macro names cannot contain '-' */ +#define __XPULPNN_MIXED_KERNELS__ + +void xpulp_nn_mix_conv_u8_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i4_4x4( + uint8_t *pIn, 
+ uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t 
dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + 
uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u4_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, 
+ uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u8_u2_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i4_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u8_i2_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i8_4x4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, 
+ uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i4_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u4_i2_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t 
out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i8_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i4_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u4_u2_i2_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i8_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i4_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + 
uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u8_i2_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i8_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t 
stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i4_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u4_i2_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + 
uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i8_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + 
uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i4_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2( /* returns void; NOTE(review): name suffix presumably encodes in/out/weight bit-widths - confirm */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_mix_conv_u2_u2_i2_4x4( /* returns void; same argument list as the non-_4x4 prototype */ + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, + uint16_t stride_y, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u8_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u4_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, 
+ uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u8_u2_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t 
num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u8_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, 
+ uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u4_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t 
*xpulp_nn_mix_matmul_u4_u2_i8_4x4( /* four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u4_u2_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + 
int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u8_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + 
uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u4_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i8_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + 
int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i4_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2( /* returns uint8_t *; two output pointers: pOut, pOut2 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +uint8_t *xpulp_nn_mix_matmul_u2_u2_i2_4x4( /* returns uint8_t *; four output pointers: pOut..pOut4 */ + uint8_t *pIn, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pOut3, + uint8_t *pOut4, + int8_t *pWeight, + int32_t *pKappa, + int32_t *pLambda, + uint16_t out_mul, + uint16_t out_shift, + uint16_t num_col_im2col, + uint16_t ch_out, + uint8_t flag_relu, + uint8_t flag_batchnorm); + +void xpulp_nn_avgpool_u8_u8( /* returns void; pIn is uint8_t*, pOut is uint8_t* */ + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i8( /* returns void; pIn is uint8_t*, pOut is int8_t* */ + uint8_t * pIn, + int8_t * pOut, + int32_t 
lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t 
dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u8_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_u2( + int8_t * 
pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i8_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t 
dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void 
xpulp_nn_avgpool_i4_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u4_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i4_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + 
uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u8( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i8( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u8( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i8( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int 
flag_requant +); + +void xpulp_nn_avgpool_u2_u4( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i4( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u4( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i4( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_u2( + uint8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_u2_i2( + uint8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_u2( + int8_t * pIn, + uint8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_avgpool_i2_i2( + int8_t * pIn, + int8_t * pOut, + int32_t lambda, + uint16_t out_shift, + int32_t out_add, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y, + int flag_requant +); + +void xpulp_nn_add_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + 
uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_u8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void 
xpulp_nn_add_i8_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t 
in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_u8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + 
int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int 
out_requant_flag); + +void xpulp_nn_add_i8_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i8_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i4_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t 
dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + 
int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + 
uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t 
in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i4_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i8_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + 
uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, 
+ int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t 
ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + 
int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i4_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, 
+ int out_requant_flag); + +void xpulp_nn_add_i2_i4_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + uint16_t out_mult1, + uint16_t out_mult2, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in); + +void xpulp_nn_add_u2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t 
dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + 
uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_u2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_u2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_u2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t 
in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i8( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i4( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + +void xpulp_nn_add_i2_i2_i2( + uint8_t * pIn1, + uint8_t * pIn2, + uint8_t * pOut, + int32_t in_mult1, + int32_t in_add1, + uint16_t in_shift1, + int32_t in_mult2, + int32_t in_add2, + uint16_t in_shift2, + int32_t out_mult, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag); + + + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_maxpool/pulp_nn_utils.h b/rt_nn_tests/xptnn_maxpool/pulp_nn_utils.h new file mode 100644 index 0000000..44d2251 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/pulp_nn_utils.h @@ -0,0 +1,2079 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include <stdint.h> /* NOTE(review): the header name was lost in extraction; <stdint.h> supplies the fixed-width integer types used below - confirm against upstream */ + +typedef signed short v2s __attribute__((vector_size (4))); /* 4-byte vector, i.e. two signed shorts */ + + + +#define min(a,b) ((a)<(b)?(a):(b)) /* the smaller argument is evaluated twice: avoid side effects */ +#define log2(x) __builtin_pulp_fl1(x) +#define PACK_INT2_SIZE(x) ((x) >> 2) /* x / 4 */ + +#define MemoryFence() asm volatile("":::"memory") /* empty asm with a "memory" clobber: a compiler-level barrier, no instruction is emitted */ + +#define CHANS_DECOMPR(x) ((5*(x)) >> 2) /* 5x/4, equivalent to division by 0.8; argument and result are parenthesized so expression arguments expand safely */ + +/* Functions for Compressed MAC: each macro emits one pv.* instruction through inline asm; sum, ptr and res are read-write ("+r") operands and config is passed with the "I" (immediate) constraint, so it must be a compile-time constant */ +#define CompressedMAC(sum, ptr, config) asm volatile( \ + "pv.smlsdotsp.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define CompressedMACUnsigned(sum, ptr, config) asm volatile( \ + "pv.smlsdotsup.t %[shum], %[phtr], %[chonfig];" \ + : [shum] "+r" (sum), [phtr] "+r" (ptr): [chonfig] "I" (config)) + +#define InitNNRF(ptr, config) asm volatile( \ + "pv.smlsdotsp.t x0, %[phtr], %[chonfig];" \ + : [phtr] "+r" (ptr) : [chonfig] "I" (config)) + +#define ThresholdCompress(res, val, thrs) asm volatile( \ + "pv.thrc %[rhes], %[vhal], %[thhrs];" : [rhes] "+r" (res) : [vhal] "r" (val), [thhrs] "r" (thrs)) + +#define GetConfig(a_update, b_update, a_reg, b_reg) ((a_update) << 4 | (b_update) << 3 | (a_reg) << 1 | (b_reg)) /* packs the four fields into one value (bit 4, bit 3, bits 2:1, bit 0); fully parenthesized so it composes inside larger expressions */ + +/* Functions for threshold&compress: these expand to bare if-statements and use incr_val, ch_out_r, currThr, pThr and ch_out from the scope in which they are expanded */ +#define check_store(res, pOut) \ + if (((res) & 0xe0000000) == 0x00000000) { \ + *(pOut) = (res) & 0xff; \ + (pOut)++; \ + incr_val=ch_out_r; } + +#define check_store_4x1(res, pOut) \ + if (((res) & 0xe0000000) == 0x00000000) { \ + *(pOut) = (res) & 0xff; \ + (pOut)++; } + +#define reset_currThr() \ + if ((uint32_t *) currThr == (uint32_t *) (pThr + ch_out)) { 
\ + currThr = (v2s *) pThr; \ + } + +#define MacLoads20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) /* thin alias: all six arguments are forwarded unchanged to the builtin */ + +#define MacLoad20(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup20_v3(a_update, b_update, a_reg, b_reg, ptr, sum) /* same forwarding, "sup" builtin instead of "sp" */ + +/* Functions for compressed min/max: each macro emits a single pv.max/pv.min instruction; res is a write-only ("=r") output, in1 and in2 are register inputs */ +#define CompressedMax(res, in1, in2) asm volatile( \ + "pv.max.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define CompressedMin(res, in1, in2) asm volatile( \ + "pv.min.t %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Max16(res, in1, in2) asm volatile( \ + "pv.max.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define Min16(res, in1, in2) asm volatile( \ + "pv.min.c %[res_], %[in1_], %[in2_];" : [res_] "=r" (res) : [in1_] "r" (in1), [in2_] "r" (in2)) + +#define thr_cmp(state, val, threshs) __builtin_pulp_thresh_compr(state, val, threshs) + +typedef unsigned char v4u __attribute__((vector_size (4))); /* 4-byte vector of four unsigned chars */ +typedef signed char v4s __attribute__((vector_size (4))); /* 4-byte vector of four signed chars */ + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#ifdef __clang__ +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_binsert(dst,not_mask_imm,src,mask_imm,off) /* clang builds use the builtin name without the pulp_ prefix */ +#else +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#endif +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) /* the max* and maxs* macros that follow are plain aliases for __builtin_pulp_maxu* and __builtin_pulp_max* respectively; presumably unsigned vs signed element-wise maxima - confirm against the toolchain documentation */ +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define maxs20(a, b) __builtin_pulp_max20(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) 
__builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) /* the min* and mins* macros are plain aliases for __builtin_pulp_minu* and __builtin_pulp_min* respectively; presumably unsigned vs signed minima - confirm against the toolchain documentation */ +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define mins20(a, b) __builtin_pulp_min20(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) /* log2 and min repeat, token for token, the definitions placed right after the include guard of this header */ +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) /* SumDotp* forward to the sdotusp builtins, SumDotps* to the sdotsp builtins; the three arguments are passed through unchanged */ +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) /* clip4/clip2/clip8 pass 15, 3 and 255 as the bound: the largest unsigned 4-, 2- and 8-bit values */ +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) /* clips4/clips2/clips8 pass 7, 1 and 127 as the bound: the largest signed 4-, 2- and 8-bit values */ +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) /* the MacLoad* macros are one-to-one aliases for the *_v3 builtins; arguments are forwarded unchanged */ +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define 
MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, 
+ int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + 
int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= 
pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) 
pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) 
bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t 
*__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) 
bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 
20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) 
bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = 
*((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) 
bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], 
com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void 
__attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) 
bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = 
(v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_tnn_zero_mem_ternary(uint8_t * pBuffer, unsigned int size, unsigned int uns) +{ + uint8_t pad_val = 0xd9; + uint32_t pad_vec = 0xd9d9d9d9; + if (uns) { + // if we are using an unsigned kernel, we need to pad with -1 because the hardware will add a +1 to ALL values! 
+ pad_val = 0xff; + pad_vec = 0xffffffff; + } + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u)pad_vec; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=pad_val; + lfover-=4; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + 
uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} 
+static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_tnn_compare_and_replace_if_larger_ternary(int8_t * base, + int8_t * target, + uint16_t length) +{ + uint8_t mask2 = 0x0c; + uint8_t n_mask2 = ~ mask2; + uint8_t mask4 = 0x30; + uint8_t n_mask4 = ~ mask4; + uint8_t mask6 = 0xc0; + uint8_t n_mask6 = ~ mask6; + uint8_t off2 = 2; + uint8_t off4 = 4; + uint8_t off6 = 6; + + uint8_t *pIn = (uint8_t *) base; + uint8_t *pCom = (uint8_t *) target; + uint8_t *out; + + int cnt = length >> 2; + uint32_t result; + + while(cnt > 0u) + { + uint32_t in1 
= *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + *((uint32_t *)pIn) = result; + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + if (left>0u) + { + // do the vector max on the whole word - we won't use the leftover bytes + uint32_t in1 = *((uint32_t *)pIn); + uint32_t in2 = *((int32_t *)pCom); + result = maxs20(in1, in2); + + // ...and copy back the relevant bytes of the result to pIn + for (int i=0; i> (8*i)); + + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, 
off6); + + pIn++; + pCom++; + left--; + } +} + +#endif diff --git a/rt_nn_tests/xptnn_maxpool/pulp_nn_utils_xpnn.h b/rt_nn_tests/xptnn_maxpool/pulp_nn_utils_xpnn.h new file mode 100644 index 0000000..0c783ae --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/pulp_nn_utils_xpnn.h @@ -0,0 +1,1937 @@ +/* + * pulp_nn_utils.h + * Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" +#ifdef GAP_SDK +#include "pulp.h" +#endif + +#define bitext(x,size,off) __builtin_pulp_bextract(x,size,off) +#define bitextu(x,size,off) __builtin_pulp_bextractu(x,size,off) +#define bitins(dst,not_mask_imm,src,mask_imm,off) __builtin_pulp_binsert(dst,not_mask_imm,src,mask_imm,off) +#define pack(x,y,z,t) __builtin_pulp_pack4(x,y,z,t) +#define max4(a,b) __builtin_pulp_maxu4(a,b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a,b) __builtin_pulp_maxusi(a,b) +#define maxs32(a,b) __builtin_pulp_maxsi(a,b) +#define min32(a,b) __builtin_pulp_minusi(a,b) +#define mins32(a,b) __builtin_pulp_minsi(a,b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) __builtin_pulp_min16(a, b) +#define avg4(a,b) __builtin_pulp_avgu4(a,b) +#define avg8(a,b) __builtin_pulp_avgu8(a,b) +#define avg16(a,b) __builtin_pulp_avgu16(a,b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a,b) ((a)<(b)?(a):(b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) 
+#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) + +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("":::"memory") + +#define LEGACY_MODE(x) asm volatile ("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile ("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile ("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile ("csrw 0x100, %0" :: "r" (x)) +#define W_ADDRESS(x) asm volatile ("csrw 0x101, %0" :: "r" (x)) +#define A_STRIDE(x) asm volatile ("csrw 0x102, %0":: "r" (x)) +#define W_STRIDE(x) asm volatile ("csrw 0x103, %0":: "r" (x)) +#define A_ROLLBACK(x) asm volatile ("csrw 0x104, %0":: "r" (x)) +#define W_ROLLBACK(x) asm volatile ("csrw 0x105, %0":: "r" (x)) +#define A_SKIP(x) asm volatile ("csrwi 0x106," x) +#define W_SKIP(x) asm volatile ("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u2 ( + 
int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u2 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i2 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i2 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i2 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u4 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = 
(integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i4 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i4 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i4 ( + int8_t pix1, + int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) pulp_nn_quant_u8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_bn_quant_u8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) pulp_nn_add_quant_u8 ( + uint8_t pix1, + uint8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + uint32_t integer_image = pix1*m1 + pix2*m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) pulp_nn_quant_i8 ( + int32_t phi, + int16_t m, + int8_t d + ) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_bn_quant_i8 ( + int32_t phi, + int32_t k, + int32_t lambda, + int8_t d + ) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) pulp_nn_add_quant_i8 ( + int8_t pix1, + 
int8_t pix2, + int16_t m1, + int16_t m2, + int8_t d + ) { + int32_t integer_image = pix1*m1 + pix2*m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + + +static uint8_t __attribute__((noinline)) pulp_nn_u4_quant(int input, int16_t * pThr) +{ + if(input <= pThr[7] ) + { + if( input <= pThr[3]) + { + if( input <= pThr[1]) + { + if( input <= pThr[0]) + return 0; + else + return 1; + } + else + { + if( input <= pThr[2]) + return 2; + else + return 3; + } + } + else + { + if( input <= pThr[5]) + { + if( input <= pThr[4]) + return 4; + else + return 5; + } + else + { + if( input <= pThr[6]) + return 6; + else + return 7; + } + } + } + else + { + if( input <= pThr[11]) + { + if( input <= pThr[9]) + { + if( input <= pThr[8]) + return 8; + else + return 9; + } + else + { + if( input <= pThr[10]) + return 10; + else + return 11; + } + } + else + { + if( input <= pThr[13]) + { + if( input <= pThr[12]) + return 12; + else + return 13; + } + else + { + if( input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) pulp_nn_u2_quant(int input, int16_t * pThr) +{ + if( input <= pThr[1]) + { + if( input <= pThr[0]) + { + return 0; + } + else + { + return 1; + } + } + else + { + if( input <= pThr[2]) + { + return 2; + } + else + { + return 3; + } + } +} + +/* + * Common + */ + + +static v4s __attribute__((noinline)) pulp_nn_i4_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = 
(uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r( int8_t *pSrc) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + v4s res = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + v4u res = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = (int8_t) bitextu((int) Src, 2, 0); + bext2 = (int8_t) bitextu((int) Src, 2, 2); + bext3 = (int8_t) bitextu((int) Src, 2, 4); + bext4 = (int8_t) bitextu((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t) bitextu((int) Src, 2, 8); + bext2 = (int8_t) bitextu((int) Src, 2, 10); + bext3 = (int8_t) bitextu((int) Src, 2, 12); + bext4 = (int8_t) bitextu((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t) 
out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i4_to_i8( int8_t *pSrc, int8_t *pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 4, 0); + bext2 = (int8_t) bitext((int) Src, 4, 4); + bext3 = (int8_t) bitext((int) Src, 4, 8); + bext4 = (int8_t) bitext((int) Src, 4, 12); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 4, 16); + bext2 = (int8_t) bitext((int) Src, 4, 20); + bext3 = (int8_t) bitext((int) Src, 4, 24); + bext4 = (int8_t) bitext((int) Src, 4, 28); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + 
+ pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 4); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 8); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 12); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 4, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 4, 20); + bext3 = (uint8_t) bitextu((unsigned int) Src, 4, 24); + bext4 = (uint8_t) bitextu((unsigned int) Src, 4, 28); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i8( int8_t * pSrc, int8_t * pDst) +{ + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) bext3, (int8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + *((v4s*)pDst) = pack((int8_t) bext1, (int8_t) bext2, (int8_t) 
bext3, (int8_t) bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u8(uint8_t * pSrc, uint8_t * pDst) +{ + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + MemoryFence(); + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + *((v4u*)pDst) = pack((uint8_t) bext1, (uint8_t) bext2, (uint8_t) bext3, (uint8_t) bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) pulp_nn_i2_to_i4( int8_t * pSrc, int8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4s Src = *((v4s*) pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (int8_t) bitext((int) Src, 2, 0); + bext2 = (int8_t) bitext((int) Src, 2, 2); + bext3 = (int8_t) bitext((int) Src, 
2, 4); + bext4 = (int8_t) bitext((int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 8); + bext2 = (int8_t) bitext((int) Src, 2, 10); + bext3 = (int8_t) bitext((int) Src, 2, 12); + bext4 = (int8_t) bitext((int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (int8_t) bitext((int) Src, 2, 16); + bext2 = (int8_t) bitext((int) Src, 2, 18); + bext3 = (int8_t) bitext((int) Src, 2, 20); + bext4 = (int8_t) bitext((int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (int8_t) bitext((int) Src, 2, 24); + bext2 = (int8_t) bitext((int) Src, 2, 26); + bext3 = (int8_t) bitext((int) Src, 2, 28); + bext4 = (int8_t) bitext((int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4s*)pDst) = pack((int8_t) out1, (int8_t) out2, (int8_t) out3, (int8_t) out4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) pulp_nn_u2_to_u4( uint8_t * pSrc, uint8_t * pDst) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + v4u Src = *((v4u*) pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + pSrc+=4; + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 0); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 2); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 4); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 8); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 10); 
+ bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 12); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + MemoryFence(); + + pDst+=4; + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 16); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 18); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 20); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 22); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + MemoryFence(); + + bext1 = (uint8_t) bitextu((unsigned int) Src, 2, 24); + bext2 = (uint8_t) bitextu((unsigned int) Src, 2, 26); + bext3 = (uint8_t) bitextu((unsigned int) Src, 2, 28); + bext4 = (uint8_t) bitextu((unsigned int) Src, 2, 30); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + *((v4u*)pDst) = pack((uint8_t) out1, (uint8_t) out2, (uint8_t) out3, (uint8_t) out4); + + return pSrc; +} + +/* + * XpulpV2 + */ + +static void __attribute__((noinline)) pulp_zero_mem(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) pulp_nn_im2col_u2_to_u8(uint8_t * pInput, uint8_t * pOutput, unsigned int blockSize) +{ + unsigned int blkCnt = blockSize >> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 
4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 4u; + int lfover = blockSize & 0x0f; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 3u; + int lfover = blockSize & 0x07; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2u; + int lfover = blockSize & 0x03; + for (int i = 0; i> 2; + + while(cnt > 0u) + { + inp = *((v4u*)pIn); + com = *((v4u*)pCom); + + *((v4u*)pIn) = max4(inp, com); + + pCom+=4; + pIn+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + if(*pIn<*pCom) + *pIn=*pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i8( + int8_t * base, int8_t * target, uint16_t length) { + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp; + v4s com; + int cnt = length >> 2; + + while (cnt > 0u) { + inp = *((v4s *)pIn); + com = *((v4s *)pCom); + + *((v4s *)pIn) = maxs4(inp, com); + + pCom += 4; + pIn += 4; + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + if (*pIn < *pCom) + *pIn = *pCom; + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u8(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + *pIn = ((*pIn + *pCom) >> 1); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[2]; + v4u com[2]; + uint8_t *out; + int cnt = length 
>> 2; + + while(cnt > 0u) + { + pulp_nn_u4_to_u8(pIn, (uint8_t *)inp); + pulp_nn_u4_to_u8(pCom, (uint8_t *)com); + + *((v4u *)out) = max4(inp[0], com[0]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4u *)out) = max4(inp[1], com[1]); + + *((uint8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((uint8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while(cnt > 0u) + { + pulp_nn_i4_to_i8(pIn, (int8_t *)inp); + pulp_nn_i4_to_i8(pCom, (int8_t *)com); + + *((v4s *)out) = maxs4(inp[0], com[0]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + *((v4s *)out) = maxs4(inp[1], com[1]); + + *((int8_t*)pIn) = bitins(*out, n_mask, *(out + 1), mask, off); + pIn++; + *((int8_t*)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 4, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 4, 4); + int8_t inB0 = (int8_t) bitext((int) *pCom, 4, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 4, 4); + + if(inA00u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + 
*((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + v4u inp[4]; + v4u com[4]; + uint8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_u2_to_u8(pIn, inp); + pulp_nn_u2_to_u8(pCom, com); + + *((v4u*)out) = max4(inp[0], com[0]); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[1], com[1]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[2], com[2]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4u*)out) = max4(inp[3], com[3]); + + inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) 
inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +static void __attribute__((noinline)) pulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + v4s inp[4]; + v4s com[4]; + int8_t *out; + int cnt = length >> 2; + + while(cnt > 0u) + { + pulp_nn_i2_to_i8(pIn, inp); + pulp_nn_i2_to_i8(pCom, com); + + *((v4s*)out) = maxs4(inp[0], com[0]); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[1], com[1]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[2], com[2]); + + inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + *((v4s*)out) = maxs4(inp[3], com[3]); + + inA = (int8_t) 
bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + pIn++; + + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((unsigned int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((unsigned int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((unsigned int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((unsigned int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, (int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((unsigned int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((unsigned int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((unsigned int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((unsigned int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) pulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + while (length>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) 
bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + inA2 = ((inA2 + inB2) >> 1); + inA3 = ((inA3 + inB3) >> 1); + + uint8_t inA = (uint8_t) bitins(inA0, n_mask2, inA1, mask2, off2); + inA = bitins(inA, n_mask4, inA2, mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, inA3, mask6, off6); + + pIn++; + pCom++; + length--; + } +} + +/* + * XpulpNN + */ + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u8(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x3; + for (int i=0; i<(size>>2); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover--; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u4(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0x7; + for (int i=0; i<(size>>3); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=2; + } +} + +static void __attribute__((noinline)) xpulp_nn_zero_mem_u2(uint8_t * pBuffer, unsigned int size) +{ + int lfover = size &0xf; + for (int i=0; i<(size>>4); i++) + { + *((v4u *)pBuffer) = (v4u){0,0,0,0}; + MemoryFence(); + pBuffer+=4; + } + while(lfover) + { + *pBuffer++=0; + lfover-=4; + } +} + + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) 
bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + if(inA0> 2; + + while (cnt > 0u) { + *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn += 4; + pCom += 4; + + cnt--; + } + + int left = length & 0x3; + while (left > 0u) { + int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0); + int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4); + int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0); + int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4); + + if (inA0 < inB0) + inA0 = inB0; + + if (inA1 < inB1) + inA1 = inB1; + + *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u4(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask = 0xf0; + int8_t n_mask = ~ mask; + int8_t off = 0x04; + + uint8_t *pIn = base; + uint8_t *pCom = target; + + int cnt = length >> 2; + + while (cnt > 0u) + { + *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 4, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 4, 4); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 4, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 4, 4); + + inA0 = ((inA0 + inB0) >> 1); + inA1 = ((inA1 + inB1) >> 1); + + *((uint8_t*)pIn) = bitins(inA0, n_mask, inA1, mask, off); + + pIn++; + pCom++; + length--; + } +} + +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + 
while(cnt > 0u) + { + *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = max4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} +static void __attribute__((noinline)) xpulp_nn_compare_and_replace_if_larger_i2(int8_t * base, + int8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + int8_t *pIn = base; + int8_t *pCom = target; + int8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + int8_t inA0 = (int8_t) bitext((int) *pIn, 2, 0); + int8_t inA1 = (int8_t) bitext((int) *pIn, 2, 2); + int8_t inA2 = (int8_t) bitext((int) *pIn, 2, 4); + int8_t inA3 = (int8_t) bitext((int) *pIn, 2, 6); + v4s inA4 = pack((int8_t) inA0, (int8_t) inA1, 
(int8_t) inA2, (int8_t) inA3); + int8_t inB0 = (int8_t) bitext((int) *pCom, 2, 0); + int8_t inB1 = (int8_t) bitext((int) *pCom, 2, 2); + int8_t inB2 = (int8_t) bitext((int) *pCom, 2, 4); + int8_t inB3 = (int8_t) bitext((int) *pCom, 2, 6); + v4s inB4 = pack((int8_t) inB0, (int8_t) inB1, (int8_t) inB2, (int8_t) inB3); + + *((v4s*)out) = maxs4(inA4, inB4); + + int8_t inA = (int8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((int8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + +static void __attribute__((noinline)) xpulp_nn_avg_and_replace_u2(uint8_t * base, + uint8_t * target, + uint16_t length) +{ + int8_t mask2 = 0x0c; + int8_t n_mask2 = ~ mask2; + int8_t mask4 = 0x30; + int8_t n_mask4 = ~ mask4; + int8_t mask6 = 0xc0; + int8_t n_mask6 = ~ mask6; + int8_t off2 = 2; + int8_t off4 = 4; + int8_t off6 = 6; + + uint8_t *pIn = base; + uint8_t *pCom = target; + uint8_t *out; + + int cnt = length >> 2; + + while(cnt > 0u) + { + *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom)); + + pIn+=4; + pCom+=4; + cnt--; + } + + int left = length & 0x3; + while (left>0u) + { + uint8_t inA0 = (uint8_t) bitextu((unsigned int) *pIn, 2, 0); + uint8_t inA1 = (uint8_t) bitextu((unsigned int) *pIn, 2, 2); + uint8_t inA2 = (uint8_t) bitextu((unsigned int) *pIn, 2, 4); + uint8_t inA3 = (uint8_t) bitextu((unsigned int) *pIn, 2, 6); + v4u inA4 = pack((uint8_t) inA0, (uint8_t) inA1, (uint8_t) inA2, (uint8_t) inA3); + uint8_t inB0 = (uint8_t) bitextu((unsigned int) *pCom, 2, 0); + uint8_t inB1 = (uint8_t) bitextu((unsigned int) *pCom, 2, 2); + uint8_t inB2 = (uint8_t) bitextu((unsigned int) *pCom, 2, 4); + uint8_t inB3 = (uint8_t) bitextu((unsigned int) *pCom, 2, 6); + v4u inB4 = pack((uint8_t) inB0, (uint8_t) inB1, (uint8_t) inB2, (uint8_t) inB3); + + *((v4u*)out) = avg4(inA4, inB4); + + uint8_t inA = (uint8_t) bitins(*out, n_mask2, *(out + 1), mask2, off2); + inA = 
bitins(inA, n_mask4, *(out + 2), mask4, off4); + *((uint8_t*)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6); + + pIn++; + pCom++; + left--; + } +} + + +#endif diff --git a/rt_nn_tests/xptnn_maxpool/test.c b/rt_nn_tests/xptnn_maxpool/test.c new file mode 100644 index 0000000..d9186c0 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/test.c @@ -0,0 +1,171 @@ +#include +#include +#include + + +#include "xpulp_tnn_matmul_ternary.h" +#include "xpulp_tnn_matmul_ternary_4x1.h" +#include "xpulp_tnn_conv_ternary.h" +#include "xpulp_tnn_conv_ternary_signed.h" +#ifndef PROFILE // HACKYTIME +#include "xpulp_tnn_conv1d_ternary.h" +#include "xpulp_tnn_conv1d_ternary_signed.h" +#endif +#include "xpulp_tnn_maxpool_ternary.h" +#include "xpulp_tnn_linear_ternary_i32_signed.h" +#include "xpulp_tnn_linear_ternary_i32.h" +#include "pmsis.h" + +#include "data_statstest.h" +//#include "pulp_nn_kernels.h" +#include "pulp_nn_mix_kernels.h" + + +#define start_cycle_counter() asm volatile("csrw 0xCC0, 0x01;") +#define stop_cycle_counter() asm volatile("csrw 0xCC0, 0x00;") +#define read_cycle_counter(x) asm volatile("csrr %0, 0x780;" : "=r" (x)) +#define reset_cycle_counter() asm volatile("csrw 0x780, 0x0;") + +uint8_t im2col[IM2COL_DIM] = {0}; +uint8_t outputs[OUTPUT_DIM] = {0}; + +int32_t outputs_fp[OUTPUT_DIM_FP] = {0}; + +#ifndef PROFILE +int num_cycles; +#else +int im2col_cycles; +int hotloop_prep_cycles; +int hotloop_cycles; +int threshold_cycles; +int requant_cycles; +int hotloop_leftover_cycles; +int matmul4x2_leftover_cycles; +#endif + +void call_krnl_0(); +void test_0(); + +int main(int argc, char *argv[]) +{ +#if KRAKEN_PTEST == 1 + kraken_padframe_aon_pad_gpioa_cfg_rxe_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_trie_set(24, 0); + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); +#endif + int n_mismatches = 0; + int mismatches_tot = 0; + + + + #ifndef PROFILE + num_cycles = 0; + #endif + + if (get_core_id() == 0) { + printf("===> TEST 0: Running 
xpulp_tnn_maxpool_ternary...\n"); + printf(" dims_in = [32, 32]\n"); + printf(" dims_kernel = [2, 2]\n"); + printf(" ch_in/out = [40, 40]\n"); + //printf(" padding_y_top = [%d]\n", padding_y_top); + //printf(" padding_y_bottom = [%d]\n", padding_y_bottom); + //printf(" padding_x_left = [%d]\n", padding_x_left); + //printf(" padding_x_right = [%d]\n", padding_x_right); + //printf(" stride_x = [%d]\n", stride_x); + //printf(" stride_y = [%d]\n", stride_y); + } + test_0(); + #ifndef PROFILE + stop_cycle_counter(); + read_cycle_counter(num_cycles); + if (get_core_id() == 0) { + printf("===> TEST 0: Finished running xpulp_tnn_maxpool_ternary\n"); + printf("num_cycles = %d\n", num_cycles); + printf("MACs = 1638400\n"); + printf("MACs/cycle = %.4f\n", 1638400/num_cycles); + } + #endif + if (get_core_id() == 0) { + printf("Checking for mismatches..\n"); + n_mismatches = 0; + + for(int i=0; i < 2048; i++) { + if (outputs[i] != exp_outp_0[i]){ + printf("***Mismatch in test 0 at iteration %d: Expected: %x, got: %x\n", i, exp_outp_0[i], outputs[i]); + n_mismatches++; + } + } + } + mismatches_tot += n_mismatches; + + if (get_core_id() == 0) { + printf("Got %d mismatches in %d tests\n", mismatches_tot, 1); + } + return mismatches_tot; +} + + +void call_krnl_0(void) { + uint8_t * pInp; + uint8_t * pIm2ColBuffer; + int8_t * pBias = NULL; + uint8_t * pOut; + int8_t * pWeight; + uint32_t * pThr; + int32_t * pKappa, pLambda; + pInp = inp_l1; + + pOut = outp_l1; +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 1); + } +#endif + xpulp_tnn_maxpool_ternary( + pInp, + pOut, + 32, + 32, + 40, + 16, + 16, + 2, + 2, + 0, + 0, + 0, + 0, + 2, + 2); + +#if KRAKEN_PTEST == 1 + if (pi_core_id() == 0) { + kraken_padframe_aon_pad_gpioa_cfg_chip2pad_set(24, 0); + } +#endif +} + +void test_0(void) { + // DMA transfer inputs from L2 to L1 + if (pi_core_id() == 0) { + plp_dma_memcpy(pIn_0, inp_l1, 8192, 1); + plp_dma_barrier(); + } + 
pi_cl_team_barrier(0); + call_krnl_0(); + // get outputs back with DMA + if (pi_core_id() == 0) { + plp_dma_memcpy(outputs, outp_l1, 2048, 0); + plp_dma_barrier(); + } +} + + + + + + + + + + diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv1d_ternary.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv1d_ternary.h new file mode 100644 index 0000000..f1a08a6 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv1d_ternary.h @@ -0,0 +1,35 @@ +#ifndef __XPULP_NN_CONV1D_TERNARY_H +#define __XPULP_NN_CONV1D_TERNARY_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv1d_ternary( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t dilation_x); +#else + uint16_t dilation_x, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv1d_ternary_signed.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv1d_ternary_signed.h new file mode 100644 index 0000000..0f8ba08 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv1d_ternary_signed.h @@ -0,0 +1,35 @@ +#ifndef __XPULP_NN_CONV1D_TERNARY_SIGNED_H +#define __XPULP_NN_CONV1D_TERNARY_SIGNED_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv1d_ternary_signed( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t padding_x_left, + uint16_t 
padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t dilation_x); +#else + uint16_t dilation_x, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv_ternary.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv_ternary.h new file mode 100644 index 0000000..f4a8759 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv_ternary.h @@ -0,0 +1,40 @@ +#ifndef __XPULP_NN_CONV_TERNARY_H +#define __XPULP_NN_CONV_TERNARY_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary( + uint8_t *pIn, + uint8_t *pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y); +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv_ternary_signed.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv_ternary_signed.h new file mode 100644 index 0000000..d05c2ab --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_conv_ternary_signed.h @@ -0,0 +1,40 @@ +#ifndef __XPULP_NN_CONV_TERNARY_SIGNED_H +#define __XPULP_NN_CONV_TERNARY_SIGNED_H +#include +#include "pulp_nn_utils.h" + + + +// TODO: review argument order +void __attribute__((noinline)) xpulp_tnn_conv_ternary_signed( + uint8_t *pIn, + uint8_t 
*pIm2ColBuffer, + int8_t *pBias, + uint8_t *pOut, + uint8_t *pWeight, + uint32_t *pThr, + uint16_t dim_in_x, + uint16_t dim_in_y, + uint16_t ch_in, + uint16_t dim_out_x, + uint16_t dim_out_y, + uint16_t ch_out, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_y_top, + uint16_t padding_y_bottom, + uint16_t padding_x_left, + uint16_t padding_x_right, + uint16_t stride_x, +#ifndef PROFILE + uint16_t stride_y); +#else + uint16_t stride_y, + uint32_t *im2col_cycles, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles, + uint32_t *matmul4x2_leftover_cycles); +#endif +#endif diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_linear_ternary_i32.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_linear_ternary_i32.h new file mode 100644 index 0000000..5c353d3 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_linear_ternary_i32.h @@ -0,0 +1,12 @@ +#ifndef __XPULP_TNN_LINEAR_TERNARY_I32_H +#define __XPULP_TNN_LINEAR_TERNARY_I32_H + +void __attribute__((noinline)) xpulp_tnn_linear_ternary_i32( + uint8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_linear_ternary_i32_signed.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_linear_ternary_i32_signed.h new file mode 100644 index 0000000..218c5c4 --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_linear_ternary_i32_signed.h @@ -0,0 +1,12 @@ +#ifndef __XPULP_TNN_LINEAR_TERNARY_I32_SIGNED_H +#define __XPULP_TNN_LINEAR_TERNARY_I32_SIGNED_H + +void __attribute__((noinline)) xpulp_tnn_linear_ternary_i32_signed( + int8_t *pIn, + int8_t *pBias, + int8_t *pOut, + int8_t *pWeight, + uint16_t dim_vec, + uint16_t num_o_neurons); + +#endif \ No newline at end of file diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_matmul_ternary.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_matmul_ternary.h new file mode 100644 index 
0000000..914daee --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_matmul_ternary.h @@ -0,0 +1,113 @@ +#ifndef __XPULP_TNN_MATMUL_TERNARY_H +#define __XPULP_TNN_MATMUL_TERNARY_H + +#include "pulp_nn_utils.h" + +uint8_t * __attribute__((noinline)) xpulp_tnn_matmul_ternary( + uint8_t *pIn, + int8_t *pBias, + uint32_t *pThr, + uint8_t *pOut, + uint8_t *pOut2, + uint8_t *pWeight, + uint16_t num_col_im2col, + uint16_t ch_out, +#ifndef PROFILE + uint32_t *thrc_res1, + uint32_t *thrc_res2); +#else + uint32_t *thrc_res1, + uint32_t *thrc_res2, + uint32_t *hotloop_prep_cycles, + uint32_t *hotloop_cycles, + uint32_t *threshold_cycles, + uint32_t *hotloop_leftover_cycles); +#endif + +inline void __attribute__((aligned(4))) hotloop_4x2( + int *sum, int *sum2, int *sum3, int *sum4, int *sum5, int *sum6, int *sum7, int *sum8, + uint32_t *ptrA, uint32_t *ptrA2, uint32_t *ptrA3, uint32_t *ptrA4, + uint32_t *ptrB, uint32_t *ptrB2, int num_col_im2col_words +){ + for (int j=0; j + * Nazareno Bruschi + * Angelo Garofalo + * + * Copyright (C) 2018-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pmsis.h" +#include "pulp_nn_utils.h" + + +void __attribute__ ((noinline)) xpulp_tnn_maxpool_ternary( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + // ch_im_in: UNCOMPRESSED number of input/output channels + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y) + +{ + #ifdef FC_TEST + int n_cores = 1; + int core_id = 0; + # else + int n_cores = NUM_CORES; + int core_id = pi_core_id(); + if (dim_im_in_y < NUM_CORES) + { + n_cores = dim_im_in_y; + } + #endif + int Log2Core = log2(n_cores); + // number of bytes: 5 els per byte + int ch_im_in_r = ch_im_in/5; + + + int chunck = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (NUM_CORES-1))!=0); + + int start = min(chunck * core_id, dim_im_in_y); + int stop = min(start + chunck, dim_im_in_y); + int i_x, i_y; + + // logic is the same as for i2, except we use the ternary version of xpulp_nn_compare_and_replace... 
+ for (i_y = start; i_y < stop; i_y++) + { + for (i_x = 0; i_x < dim_im_out_x; i_x++) + { + /* for each output pixel */ + int8_t *target = pIn + (i_y * dim_im_in_x + i_x) * ch_im_in_r; + uint8_t *win_start; + uint8_t *win_stop; + if (i_x * stride_x - padding_l < 0) + { + win_start = target; + } + else + { + win_start = pIn + (i_y * dim_im_in_x + i_x * stride_x - padding_l) * ch_im_in_r; + } + + if (i_x * stride_x - padding_l + dim_kernel_x >= dim_im_in_x) + { + win_stop = pIn + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in_r; + } + else + { + win_stop = pIn + (i_y * dim_im_in_x + i_x * stride_x - padding_l + dim_kernel_x) * ch_im_in_r; + } + + /* first step is to copy over initial data */ + for (int i = 0; i< ch_im_in_r; i++) target[i] = win_start[i]; + + /* start the max operation from the second part */ + win_start += ch_im_in_r; + for (; win_start < win_stop; win_start += ch_im_in_r) + { + xpulp_tnn_compare_and_replace_if_larger_ternary(target, win_start, ch_im_in_r); + } + } + } +#ifndef FC_TEST + pi_cl_team_barrier(0); +#endif + if (dim_im_out_y < NUM_CORES) + { + n_cores = dim_im_out_y; + } + Log2Core = log2(n_cores); + int chunck2 = (dim_im_out_y >> Log2Core) + ((dim_im_out_y & (NUM_CORES-1))!=0); + int start2 = chunck2 * core_id; + int stop2 = min(start2 + chunck2, dim_im_out_y); + + /* then does the pooling along y axis */ + for (i_y = start2; i_y < stop2; i_y++) + { + /* for each output row */ + int8_t *target = pOut + i_y * dim_im_out_x * ch_im_in_r; + int8_t *row_start; + int8_t *row_end; + /* setting the starting row */ + if (i_y * stride_y - padding_t < 0) + { + row_start = pIn; + } + else + { + row_start = pIn + (i_y * stride_y - padding_t) * dim_im_in_x * ch_im_in_r; + } + /* setting the stopping row */ + if (i_y * stride_y - padding_t + dim_kernel_y >= dim_im_in_y) + { + row_end = pIn + dim_im_in_y * dim_im_in_x * ch_im_in_r; + } + else + { + row_end = pIn + (i_y * stride_y - padding_t + dim_kernel_y) * dim_im_in_x * ch_im_in_r; + } + + /* copy over 
the first row */ + for (int i = 0; i< dim_im_out_x * ch_im_in_r; i++) + { + target[i] = (int8_t) row_start[i]; + } + /* move over to next row */ + row_start += ch_im_in_r * dim_im_in_x; + + for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in_r) + { + xpulp_tnn_compare_and_replace_if_larger_ternary(target, row_start, dim_im_out_x * ch_im_in_r); + } + } + #ifndef FC_TEST + pi_cl_team_barrier(0); + #endif +} diff --git a/rt_nn_tests/xptnn_maxpool/xpulp_tnn_maxpool_ternary.h b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_maxpool_ternary.h new file mode 100644 index 0000000..2b65fca --- /dev/null +++ b/rt_nn_tests/xptnn_maxpool/xpulp_tnn_maxpool_ternary.h @@ -0,0 +1,20 @@ +#ifndef __XPULP_TNN_MAXPOOL_TERNARY_H +#define __XPULP_TNN_MAXPOOL_TERNARY_H +void __attribute__ ((noinline)) xpulp_tnn_maxpool_ternary( + int8_t * pIn, + int8_t * pOut, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + // ch_im_in: UNCOMPRESSED number of input/output channels + uint16_t ch_im_in, + uint16_t dim_im_out_x, + uint16_t dim_im_out_y, + uint16_t dim_kernel_x, + uint16_t dim_kernel_y, + uint16_t padding_t, + uint16_t padding_b, + uint16_t padding_l, + uint16_t padding_r, + uint16_t stride_x, + uint16_t stride_y); +#endif