diff --git a/examples/targets/carfield/config/Makefile b/examples/targets/carfield/config/Makefile index 8cce6e5..9ea9514 100644 --- a/examples/targets/carfield/config/Makefile +++ b/examples/targets/carfield/config/Makefile @@ -47,7 +47,7 @@ export PULP_SDK_HOME = $(PULPRT_HOME) PULP_APPS = pulp_cluster_runtime PULP_APP_SRCS = $(MATCH_COMMON_SRCS) $(MATCH_pulp_cluster_OFFLOAD_SRC) $(PULPRT_HOME)/lib/libc/minimal/io.c $(PULPRT_HOME)/lib/libc/minimal/prf.c -PULP_CFLAGS = -O3 $(MATCH_INCLUDES) -DCLUSTER_COMPILATION -DARCHI_CLUSTER_NB_PE=8 -I$(PULPRT_HOME)/lib/libc/minimal/include -D__pulp_cluster__ +PULP_CFLAGS = -O3 $(MATCH_INCLUDES) -DCLUSTER_COMPILATION -DARCHI_CLUSTER_NB_PE=8 -I$(PULPRT_HOME)/lib/libc/minimal/include -D__pulp_cluster__ -Dhalf=float16 -D_Float16=float16 PULPD_ELF_REMOVE_SECTIONS := --remove-section .l1cluster_g --remove-section .bss_l1 -include $(PULP_SDK_HOME)/install/rules/pulp.mk @@ -61,7 +61,7 @@ build-offload: $(PULPD_RISCV)-objcopy $(PULPD_ELF_REMOVE_SECTIONS) $(BUILD_DIR)/build/pulp_cluster_runtime/pulp_cluster_runtime; @echo "Generating objdump..." - $(PULPD_RISCV)-objdump -d -S $(BUILD_DIR)/build/pulp_cluster_runtime/pulp_cluster_runtime > $(BUILD_DIR)/build/pulp_cluster_runtime/pulp_cluster_runtime.dump; + $(PULPD_RISCV)-objdump -drwCS $(BUILD_DIR)/build/pulp_cluster_runtime/pulp_cluster_runtime > $(BUILD_DIR)/build/pulp_cluster_runtime/pulp_cluster_runtime.dump; @echo "Runtime offload build done." 
@@ -74,7 +74,7 @@ build-offload: CAR_SW_DIR := $(CAR_ROOT)/sw CHS_ROOT ?= $(shell $(BENDER) path cheshire) -CHS_SW_GCC_BINROOT ?= /usr/pack/riscv-1.0-kgf/riscv64-gcc-11.2.0/bin +CHS_SW_GCC_BINROOT ?= /usr/pack/riscv-1.0-kgf/riscv64-gcc-14.2.0/bin -include $(CHS_ROOT)/cheshire.mk CHS_BOOTMODE ?= 0 # default passive bootmode @@ -105,7 +105,7 @@ $(HOST_LIB): $(HOST_LIB_SRCS_O) $(CAR_SW_DIR)/%.car.o: $(CAR_SW_DIR)/%.c $(CHS_SW_CC) $(CAR_SW_INCLUDES) $(CHS_SW_CCFLAGS) -c $< -o $@ -HOST_FLAGS := -T$(HOST_LD_SCRIPT) -Wno-pointer-to-int-cast -DIntClustNumCores=8 -g +HOST_FLAGS := -T$(HOST_LD_SCRIPT) -Wno-pointer-to-int-cast -DIntClustNumCores=8 -Dhalf=_Float16 -g -march=rv64gc_zifencei @@ -123,5 +123,5 @@ build-host: $(HOST_LIB) build-payload @echo $(HOST_LIB_SRCS_O) $(CHS_SW_CC) $(HOST_INCLUDES) $(MATCH_INCLUDES) $(CHS_SW_LDFLAGS) $(HOST_FLAGS) -o $(BUILD_DIR)/host.elf $(HOST_LIB) $(MATCH_COMMON_SRCS) $(MATCH_HOST_SRC) $(CHS_SW_LIBS) @echo "Generating objdump" - @$(CHS_SW_OBJDUMP) -d -S $(BUILD_DIR)/host.elf > $(BUILD_DIR)/host.dump + @$(CHS_SW_OBJDUMP) -drwCS $(BUILD_DIR)/host.elf > $(BUILD_DIR)/host.dump @echo "Host build done" \ No newline at end of file diff --git a/examples/targets/carfield/config/link.ld b/examples/targets/carfield/config/link.ld index cd9397a..281ffe1 100644 --- a/examples/targets/carfield/config/link.ld +++ b/examples/targets/carfield/config/link.ld @@ -77,11 +77,9 @@ SECTIONS { *(.bulk.*) } > l2 - .l2_heap : ALIGN(32) { + .l2_heap (NOLOAD) : ALIGN(32) { __l2_heap_start = .; - *(.l2_heap) - *(.l2_heap.*) - . = ALIGN(32); + . 
= ORIGIN(l2) + LENGTH(l2) - LENGTH(l2_common); __l2_heap_end = .; } > l2 diff --git a/examples/targets/carfield/libs/carfield_lib/include/carfield.h b/examples/targets/carfield/libs/carfield_lib/include/carfield.h index 7698cde..4c67670 100644 --- a/examples/targets/carfield/libs/carfield_lib/include/carfield.h +++ b/examples/targets/carfield/libs/carfield_lib/include/carfield.h @@ -39,7 +39,7 @@ void carfield_free_ram(void* ext, size_t size); extern volatile uint32_t last_completed_node_id; extern volatile uint32_t last_task_error_code; -#define GLOBAL_IRQ_ENABLE 0x00001808 +#define GLOBAL_IRQ_ENABLE (1UL << 3) #define EXTERNAL_IRQ_ENABLE 0x00000800 #define PLIC_BASE_ADDRESS 0x04000000 diff --git a/examples/targets/carfield/libs/carfield_lib/include/cluster.h b/examples/targets/carfield/libs/carfield_lib/include/cluster.h index e6aa955..7f8230c 100644 --- a/examples/targets/carfield/libs/carfield_lib/include/cluster.h +++ b/examples/targets/carfield/libs/carfield_lib/include/cluster.h @@ -14,6 +14,10 @@ #include "pulp.h" #include "bench/bench.h" #include "pulp_nn/pulp_nn_kernels.h" + +typedef float16 fp16; +typedef fp16 v2f16 __attribute__((vector_size (4))); + #endif @@ -76,6 +80,8 @@ void pulp_nn_hoparallel_conv2d_wrapper(MatchCtx* ctx); void pulp_nn_add_wrapper(MatchCtx* ctx); +void pulp_nn_dense_fp16_wrapper(MatchCtx* ctx); + void pulp_nn_wrapper(MatchCtx* ctx); #endif // CAR_LIB_CLUSTER_H \ No newline at end of file diff --git a/examples/targets/carfield/libs/carfield_lib/src/carfield.c b/examples/targets/carfield/libs/carfield_lib/src/carfield.c index 40d7af4..981ca5c 100644 --- a/examples/targets/carfield/libs/carfield_lib/src/carfield.c +++ b/examples/targets/carfield/libs/carfield_lib/src/carfield.c @@ -115,10 +115,19 @@ static dif_rv_plic_t plic0; void carfield_init_plic() { // Reset PLIC dif_rv_plic_reset(&plic0); + // Set global interrupt enable in CVA6 csr - asm volatile("csrw mstatus, %0\n" : : "r"(GLOBAL_IRQ_ENABLE)); + unsigned long mstatus; + asm 
volatile ("csrr %0, mstatus" : "=r"(mstatus));
+    mstatus |= GLOBAL_IRQ_ENABLE;
+    asm volatile ("csrw mstatus, %0" :: "r"(mstatus));
+
     // Set external interrupt enable in CVA6 csr
-    asm volatile("csrw mie, %0\n" : : "r"(EXTERNAL_IRQ_ENABLE));
+    unsigned long mie;
+    asm volatile ("csrr %0, mie" : "=r"(mie));
+    mie |= EXTERNAL_IRQ_ENABLE;
+    asm volatile ("csrw mie, %0" :: "r"(mie));
+
     // Setup PLIC
     mmio_region_t plic_base_addr = mmio_region_from_addr(PLIC_BASE_ADDRESS);
     dif_result_t t = dif_rv_plic_init(plic_base_addr, &plic0);
diff --git a/examples/targets/carfield/libs/carfield_lib/src/cluster.c b/examples/targets/carfield/libs/carfield_lib/src/cluster.c
index 3078f1e..cb8775b 100644
--- a/examples/targets/carfield/libs/carfield_lib/src/cluster.c
+++ b/examples/targets/carfield/libs/carfield_lib/src/cluster.c
@@ -5,6 +5,9 @@
 #include "carfield_lib/mbox.h"
 #include "carfield_lib/utils.h"
 
+#include "pulp_nn/pulp_nn_kernels.h"
+#include "pulp_nn_fp16/pulp_nn_kernels_fp16.h"
+
 //#define CLUSTER_LIB_DEBUG
 #define DEBUG_CALLOC_L1_SCRATCHPAD 0
 #define DEBUG_BLOCKING_DMA 0
@@ -58,6 +61,11 @@ void cluster_sync_cores(MatchCtx* ctx)
 
 void cluster_lib_init(MatchCtx* ctx)
 {
+    #ifdef CLUSTER_LIB_DEBUG
+    for (int i = 0; i < 20000; i++) {
+        asm volatile("fence rw,rw":::"memory");
+    }
+    #endif
     dma_transfer_ = dma_transfer_create();
     #ifdef CLUSTER_LIB_DEBUG
     mini_printf("[PULP] Yo! Cluster is alive! DMA counter is %d\r\n", dma_transfer_);
@@ -719,6 +727,40 @@ void pulp_nn_add_wrapper(MatchCtx* ctx){
     );
 }
 
+
+/**
+ * Run the FP16 dense (linear) kernel on the tiles currently described by ctx.
+ * Indexing below takes tensor 0 as activations, 1 as weights, last as output.
+ */
+void pulp_nn_dense_fp16_wrapper(MatchCtx* ctx) {
+    MatchTensor* tensors = ctx->tensors->tensors;
+    int num_tensors = ctx->tensors->num_tensors;
+    int out_ch = tensors[num_tensors-1].tiles[L1_SCRATCHPAD*2+1].size;
+    int inp_ch = tensors[0].tiles[L1_SCRATCHPAD*2+1].size;
+    #ifdef CLUSTER_LIB_DEBUG
+    if(rt_core_id() == 0) {
+        mini_printf("[PULP][KER] pulp_nn_linear_fp16: ");
+        mini_printf("Out. tile (%d,) | ", out_ch);
+        mini_printf("Inp. tile (%d,)\r\n", inp_ch);
+    }
+    #endif
+    pulp_nn_linear_fp16(
+        // activations pt
+        (float16*)tensors[0].pt, // acts pt
+        // weights pt
+        (float16*)tensors[1].pt, // weights pt
+        // output pt
+        (float16*)tensors[num_tensors-1].pt, // output pt
+        // bias pt: index 2 is only a bias in the 4-tensor layout; with 3
+        // tensors it is the output itself, so pass NULL (kernel reads it as 0)
+        num_tensors==4 ? (float16*)tensors[2].pt : (float16*)NULL, // bias pt
+        // dims
+        inp_ch,
+        out_ch
+    );
+}
+
+
 
 void pulp_nn_wrapper(MatchCtx* ctx){
     switch(ctx->pattern_name){
@@ -728,9 +770,9 @@ void pulp_nn_wrapper(MatchCtx* ctx){
         case conv2d:
             pulp_nn_hoparallel_conv2d_wrapper(ctx);
             break;
-        case dense_out:
-            pulp_nn_dense_out_int_wrapper(ctx);
-            break;
+        //case dense_out:
+        //    pulp_nn_dense_out_int_wrapper(ctx);
+        //    break;
         // case pulp_nn_dw_conv2d_less_4_pattern:
         //     pi_team_offload_preset(pulp_nn_dw_conv2d_less_4_wrapper, ctx);
         //     break;
@@ -743,6 +785,9 @@ void pulp_nn_wrapper(MatchCtx* ctx){
         case add_requant:
             pulp_nn_add_wrapper(ctx);
             break;
+        case dense_fp16:
+            pulp_nn_dense_fp16_wrapper(ctx);
+            break;
         default:
             break;
     }
@@ -807,4 +852,14 @@ uint32_t cluster_timer_stop() {
 }
 
 
+
+double __attribute__((weak)) __extendhfdf2(float16 val)
+{
+    float res;
+    __asm__ __volatile__ ("fcvt.s.h %0, %1": "=f"(res): "f"(val) :);
+    return (double) res;
+}
+
+
+
 #endif
\ No newline at end of file
diff --git a/examples/targets/carfield/libs/carfield_lib/src/malloc.c b/examples/targets/carfield/libs/carfield_lib/src/malloc.c
index 3b2561e..7273a59 100644
--- a/examples/targets/carfield/libs/carfield_lib/src/malloc.c
+++ b/examples/targets/carfield/libs/carfield_lib/src/malloc.c
@@ -1,9 +1,13 @@
-/**
- * Malloc implementation for bare metal systems using linker-defined heap
+/*
+ * Basic malloc implementation for L2 SPM,
+ * pointers are uint32_t so that, in theory,
+ * the same memory pool could be shared between
+ * 64bit host and 32bit cluster cores.
*/ #include "carfield_lib/malloc.h" #include "carfield_lib/carfield.h" +#include "carfield_lib/printf.h" #include #include @@ -12,126 +16,142 @@ uint8_t* memory_pool_l2 = &__l2_heap_start; block_header_t* free_list = NULL; + +static size_t l2_heap_size(void) { + return (size_t)(&__l2_heap_end - &__l2_heap_start); +} + +// Offset helpers +static inline uint32_t ptr_to_offset(void* ptr) { + if (!ptr) return 0; + return (uint32_t)((uint8_t*)ptr - memory_pool_l2); +} + +static inline block_header_t* offset_to_ptr(uint32_t offset) { + if (!offset) return NULL; + return (block_header_t*)(memory_pool_l2 + offset); +} + /** * Initialize the memory allocator */ void mem_init_l2(void) { // Create initial free block spanning the entire memory pool free_list = (block_header_t*)memory_pool_l2; - free_list->size = (uint32_t)(&__l2_heap_end - &__l2_heap_start); + free_list->size = (uint32_t)l2_heap_size(); free_list->is_free = 1; free_list->next = 0; + + mini_printf("[MALLOC] L2 memory pool initialized with size %d bytes.\r\n", (int)free_list->size); } /** * Allocate memory of specified size - * + * * @param size Size of memory to allocate in bytes * @return Pointer to allocated memory or NULL if allocation fails */ void* malloc_l2(size_t size) { - carprint("malloc\r\n"); block_header_t *curr, *prev, *new_block; void* result = NULL; - + // Adjust size to include the header and ensure alignment (8-byte in this case) size_t aligned_size = (size + sizeof(block_header_t) + 7) & ~7; - + // Ensure minimum allocation size if (aligned_size < MIN_ALLOC_SIZE + sizeof(block_header_t)) aligned_size = MIN_ALLOC_SIZE + sizeof(block_header_t); - + // Initialize memory pool if not already done if (free_list == NULL) mem_init_l2(); - + // First-fit search for a free block prev = NULL; curr = free_list; - + while (curr != NULL) { if (curr->is_free && curr->size >= aligned_size) { // Found a suitable block - + // Split the block if it's significantly larger than requested if (curr->size >= 
aligned_size + sizeof(block_header_t) + MIN_ALLOC_SIZE) { new_block = (block_header_t*)((uint8_t*)curr + aligned_size); new_block->size = curr->size - aligned_size; new_block->is_free = 1; new_block->next = curr->next; - + curr->size = aligned_size; - curr->next = (uint32_t)new_block; + curr->next = ptr_to_offset(new_block); } - + // Mark block as allocated curr->is_free = 0; - + // Return pointer to usable memory (after header) result = (void*)((uint8_t*)curr + sizeof(block_header_t)); break; } - + prev = curr; - curr = (block_header_t*)curr->next; + curr = offset_to_ptr(curr->next); } - + + mini_printf("[MALLOC] Allocated %d bytes block in L2 at %p.\r\n", (int)size, result); return result; } /** * Free previously allocated memory - * + * * @param ptr Pointer to memory to free */ void free_l2(void* ptr) { block_header_t *block, *next, *prev; - + if (ptr == NULL) return; - + // Get the block header from the pointer block = (block_header_t*)((uint8_t*)ptr - sizeof(block_header_t)); - + // Sanity check - ensure the pointer is within our heap - if ((uint8_t*)block < memory_pool_l2 || - (uint8_t*)block >= memory_pool_l2 + (&__l2_heap_end - &__l2_heap_start)) + if ((uint8_t*)block < memory_pool_l2 || + (uint8_t*)block >= memory_pool_l2 + l2_heap_size()) return; // Ignore attempts to free memory outside our heap - + // Mark block as free block->is_free = 1; - + // Coalesce with adjacent free blocks - + // Find the previous block prev = NULL; next = free_list; while (next != NULL && next < block) { prev = next; - next = (block_header_t*)next->next; + next = offset_to_ptr(next->next); } - + // Merge with next block if adjacent and free - if ((uint8_t*)block + block->size == (uint8_t*)next && next->is_free) { + if (next && ((uint8_t*)block + block->size == (uint8_t*)next) && next->is_free) { block->size += next->size; block->next = next->next; } else { - block->next = (uint32_t)next; + block->next = ptr_to_offset(next); } - + // Merge with previous block if adjacent and 
free - if (prev != NULL && (uint8_t*)prev + prev->size == (uint8_t*)block && prev->is_free) { + if (prev && ((uint8_t*)prev + prev->size == (uint8_t*)block) && prev->is_free) { prev->size += block->size; prev->next = block->next; - } else if (prev != NULL) { - prev->next = (uint32_t)block; + } else if (prev) { + prev->next = ptr_to_offset(block); } else { free_list = block; } } -// [calloc and realloc implementations remain the same as before] - void* malloc(size_t size) { return malloc_l2(size); diff --git a/examples/targets/carfield/libs/carfield_lib/src/printf.c b/examples/targets/carfield/libs/carfield_lib/src/printf.c index 1b925cd..7dd3bfd 100644 --- a/examples/targets/carfield/libs/carfield_lib/src/printf.c +++ b/examples/targets/carfield/libs/carfield_lib/src/printf.c @@ -3,10 +3,11 @@ #include #include #include +#include #include "carfield_lib/uart.h" -// Convert integers to strings with support for different sizes + void mini_itoa(int value, char *str, int base) { char *ptr = str, *ptr1 = str, tmp_char; int tmp_value; @@ -32,7 +33,7 @@ void mini_itoa(int value, char *str, int base) { } } -// Convert unsigned long integers to strings (for pointers) + void mini_ultoa(unsigned long value, char *str, int base) { char *ptr = str, *ptr1 = str, tmp_char; unsigned long tmp_value; @@ -53,15 +54,61 @@ void mini_ultoa(unsigned long value, char *str, int base) { } +static void mini_ftoa(double f, char *buf, int precision) { + if (isnan(f)) { + buf[0] = 'n'; buf[1] = 'a'; buf[2] = 'n'; buf[3] = '\0'; + return; + } + if (isinf(f)) { + if (f < 0) { + buf[0] = '-'; buf[1] = 'i'; buf[2] = 'n'; buf[3] = 'f'; buf[4] = '\0'; + } else { + buf[0] = 'i'; buf[1] = 'n'; buf[2] = 'f'; buf[3] = '\0'; + } + return; + } + if (f < 0) { + *buf++ = '-'; + f = -f; + } + unsigned long ipart = (unsigned long)f; + double fpart = f - (double)ipart; + + // Integer part + char tmp[20]; + mini_ultoa(ipart, tmp, 10); + char *p = tmp; + while (*p) *buf++ = *p++; + + // Decimal point and 
fractional part + if (precision > 0) { + *buf++ = '.'; + // Multiply out for specified precision, round correctly + double rounding = 0.5; + for (int i = 0; i < precision; ++i) + rounding /= 10.0; + fpart += rounding; + + for (int i = 0; i < precision; ++i) { + fpart *= 10.0; + int digit = (int)fpart; + *buf++ = '0' + digit; + fpart -= digit; + } + } + *buf = '\0'; +} + + size_t mini_vsnprintf(char *out, size_t n, const char *fmt, va_list args) { char *out_ptr = out; size_t remaining = n; - if (n == 0) return 0; // Handle zero-sized buffer + if (n == 0) return 0; - char buffer[20]; // Increased to handle 64-bit pointers (16 hex digits + null terminator) - - while (*fmt && remaining > 1) { // Keep space for null terminator + char buffer[32]; + + while (*fmt && remaining > 1) { if (*fmt == '%') { fmt++; if (*fmt == 'd' || *fmt == 'i') { @@ -73,13 +120,12 @@ size_t mini_vsnprintf(char *out, size_t n, const char *fmt, va_list args) { } } else if (*fmt == 's') { char *str = va_arg(args, char*); - if (str) { // Check for NULL pointer + if (str) { while (*str && remaining > 1) { *out_ptr++ = *str++; remaining--; } } else { - // Handle NULL string const char *null_str = "NULL"; for (const char *p = null_str; *p && remaining > 1; p++) { *out_ptr++ = *p; @@ -89,7 +135,6 @@ size_t mini_vsnprintf(char *out, size_t n, const char *fmt, va_list args) { } else if (*fmt == 'x') { unsigned int val = va_arg(args, unsigned int); mini_itoa(val, buffer, 16); - // Only add 0x prefix if there's room if (remaining > 2) { *out_ptr++ = '0'; remaining--; *out_ptr++ = 'x'; remaining--; @@ -99,70 +144,59 @@ size_t mini_vsnprintf(char *out, size_t n, const char *fmt, va_list args) { } } } else if (*fmt == 'p' || *fmt == 'P') { - // Handle pointer type with proper casting void *ptr = va_arg(args, void*); if (ptr == NULL) { - // Handle NULL pointer const char *null_ptr = "NULL"; for (const char *p = null_ptr; *p && remaining > 1; p++) { *out_ptr++ = *p; remaining--; } } else { - // Convert pointer 
to hex representation with proper size unsigned long ptr_val = (unsigned long)ptr; mini_ultoa(ptr_val, buffer, 16); - - // Add leading zeros to ensure consistent width + int len = 0; for (char *p = buffer; *p; p++) len++; - - // Add 0x prefix and pad with zeros if there's room + if (remaining > 2) { *out_ptr++ = '0'; remaining--; *out_ptr++ = 'x'; remaining--; - - // Add padding zeros for consistent pointer width - // For 32-bit: 8 hex digits, For 64-bit: 16 hex digits int target_width; - - // Determine if we're using a 32-bit or 64-bit pointer int is_64bit = 0; #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_M_X64) is_64bit = 1; #endif - - // When using %p, print as 32-bit (8 hex digits) - // When using %P, print as 64-bit (16 hex digits) - // Or when using %p but the pointer requires 64-bit representation if (*fmt == 'P' || (is_64bit && len > 8)) { - target_width = 16; // 64-bit format + target_width = 16; } else { - target_width = 8; // 32-bit format + target_width = 8; } - int padding = target_width - len; while (padding > 0 && remaining > 1) { *out_ptr++ = '0'; remaining--; padding--; } - - // Add the actual hex digits for (char *p = buffer; *p && remaining > 1; p++) { *out_ptr++ = *p; remaining--; } } } + } else if (*fmt == 'f') { + // Default: 6 decimal places + double val = va_arg(args, double); + mini_ftoa(val, buffer, 6); + for (char *p = buffer; *p && remaining > 1; p++) { + *out_ptr++ = *p; + remaining--; + } } else if (*fmt == '%') { - // Handle %% escape if (remaining > 1) { *out_ptr++ = '%'; remaining--; } } else { - // Unknown specifier, print as is if (remaining > 1) { *out_ptr++ = '%'; remaining--; } @@ -176,9 +210,8 @@ size_t mini_vsnprintf(char *out, size_t n, const char *fmt, va_list args) { } fmt++; } - *out_ptr = '\0'; // Null-terminate - - return n - remaining; // Return number of characters written (not including null terminator) + *out_ptr = '\0'; + return n - remaining; } diff --git 
a/examples/targets/carfield/libs/pulp_nn_fp16/include/pulp_nn_kernels_fp16.h b/examples/targets/carfield/libs/pulp_nn_fp16/include/pulp_nn_kernels_fp16.h
new file mode 100644
index 0000000..1d70f7b
--- /dev/null
+++ b/examples/targets/carfield/libs/pulp_nn_fp16/include/pulp_nn_kernels_fp16.h
@@ -0,0 +1,14 @@
+#ifdef __pulp_cluster__
+
+#include
+
+void pulp_nn_linear_fp16(
+    float16 *__restrict__ input,
+    float16 *__restrict__ weight,
+    float16 *__restrict__ output,
+    float16 *__restrict__ bias,
+    uint32_t dim_i,
+    uint32_t dim_o
+);
+
+#endif
\ No newline at end of file
diff --git a/examples/targets/carfield/libs/pulp_nn_fp16/src/pulp_nn_linear_fp16.c b/examples/targets/carfield/libs/pulp_nn_fp16/src/pulp_nn_linear_fp16.c
new file mode 100644
index 0000000..3852fed
--- /dev/null
+++ b/examples/targets/carfield/libs/pulp_nn_fp16/src/pulp_nn_linear_fp16.c
@@ -0,0 +1,43 @@
+#ifdef __pulp_cluster__
+
+#include "pulp_nn_fp16/pulp_nn_kernels_fp16.h"
+
+#include
+
+#include
+
+#define min(a,b) ((a)<(b)?(a):(b))
+
+
+/*
+ * FP16 dense layer: output[j] = bias[j] + sum over k of input[k] * weight[j*dim_i + k].
+ * A NULL bias is treated as zero. Output rows are split into contiguous
+ * chunks and each core handles the chunk selected by rt_core_id().
+ */
+void pulp_nn_linear_fp16(
+    float16 *__restrict__ input,
+    float16 *__restrict__ weight,
+    float16 *__restrict__ output,
+    float16 *__restrict__ bias,
+    uint32_t dim_i,
+    uint32_t dim_o
+)
+{
+    const int NUM_CORES = get_core_num();
+
+    // Ceiling division: same value as the shift/mask form when the core count
+    // is a power of two, and still covers every row for any other count.
+    int chunk = (dim_o + NUM_CORES - 1) / NUM_CORES;
+    int start = min(chunk * rt_core_id(), dim_o);
+    int stop = min(start + chunk, dim_o);
+
+    for (int j = start; j < stop; j++) {
+        float16 sum = bias ? bias[j] : 0;
+        for (int k = 0; k < dim_i; k++) {
+            sum += input[k] * weight[j * dim_i + k];
+        }
+        output[j] = sum;
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/examples/targets/carfield/model_fp16/model_graph.relay b/examples/targets/carfield/model_fp16/model_graph.relay
new file mode 100644
index 0000000..420db91
--- /dev/null
+++ b/examples/targets/carfield/model_fp16/model_graph.relay
@@ -0,0 +1,14 @@
+#[version = "0.0.5"]
+def @main(%input_0: Tensor[(1, 3), float16], %dense_1_weights: Tensor[(3872, 3), float16], %dense_1_bias: Tensor[(3872), float16], %conv_weights: Tensor[(3, 8, 3, 3), float16], %conv_bias: Tensor[(3), float16], %dense_2_weights: Tensor[(8, 363), float16], %dense_2_bias: Tensor[(8), float16]) {
+  %0 = nn.dense(%input_0, %dense_1_weights, units=None, out_dtype="float16");
+  %1 = nn.bias_add(%0, %dense_1_bias, axis=-1);
+  %2 = nn.relu(%1);
+  %3 = reshape(%2, newshape=[1, 8, 22, 22]);
+  %4 = nn.conv2d(%3, %conv_weights, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3]);
+  %5 = nn.bias_add(%4, %conv_bias);
+  %6 = nn.relu(%5);
+  %7 = reshape(%6, newshape=[1, 363]);
+  %8 = nn.dense(%7, %dense_2_weights, units=None, out_dtype="float16");
+  %9 = nn.bias_add(%8, %dense_2_bias, axis=-1);
+  nn.relu(%9)
+}
diff --git a/examples/targets/carfield/model_fp16/model_params.txt b/examples/targets/carfield/model_fp16/model_params.txt
new file mode 100644
index 0000000..487f6db
Binary files /dev/null and b/examples/targets/carfield/model_fp16/model_params.txt differ
diff --git a/examples/targets/carfield/pulp_cluster.py b/examples/targets/carfield/pulp_cluster.py
index f679f2a..f1ae5f3 100644
--- a/examples/targets/carfield/pulp_cluster.py
+++ b/examples/targets/carfield/pulp_cluster.py
@@ -9,7 +9,7 @@
 from match.cost_model.examples.pulp_cluster import PulpClusterCostModel
 from match.target.memory_inst import MemoryInst
 from match.tensor.tensor import MatchTensor
-from tvm.relay.dataflow_pattern import wildcard, is_op, is_constant
+from
tvm.relay.dataflow_pattern import wildcard, is_op, is_constant, has_dtype
 from match.partition.partitioning_pattern import PartitioningPattern
 
 class PulpCluster(ExecModule):
@@ -19,6 +19,7 @@ def __init__(self, num_cores: int=8, l1_kb_size: int=64, l2_kb_size: int=512,
                           libs_required={
                               "carfield_lib": ModuleLib(name="carfield_lib", base_path=os.path.dirname(__file__)+"/libs/carfield_lib"),
                               "pulp_nn": ModuleLib(name="pulp_nn", base_path=os.path.dirname(__file__)+"/libs/pulp_nn"),
+                              "pulp_nn_fp16": ModuleLib(name="pulp_nn_fp16", base_path=os.path.dirname(__file__)+"/libs/pulp_nn_fp16"),
                           })
         self.NUM_CORES = num_cores
         self.L1_SCRATCHPAD_KB_SIZE = l1_kb_size
@@ -188,6 +189,11 @@ def dense_pt_out():
             add = is_op("add")(dense, is_constant()) | is_op("add")(is_op("cast")(dense),is_constant())
             return add
 
+        def dense_fp16():
+            dense = is_op("nn.dense")(wildcard(), wildcard())
+            dense_add = is_op("add")(dense, is_constant())
+            return dense_add
+
         def add_pt_requant():
             cast_a = is_op("cast")(wildcard())
             cast_b = is_op("cast")(wildcard())
@@ -201,6 +207,11 @@ def add_pt_requant():
 
         def only_out_uint8(node):
             return add_checks_get_first_op(node, "cast").attrs.dtype=="uint8"
+
+        def only_out_fp16(node):
+            is_fp16 = add_checks_get_first_op(node, "nn.dense").attrs.out_dtype == "float16"
+            is_fp16 |= getattr(node.attrs, "out_dtype", None) == "float16"
+            return is_fp16
 
         def only_std_convs(node):
             conv = add_checks_get_first_op(node, "nn.conv2d")
@@ -239,7 +250,8 @@ def only_dw_convs(node):
             return True
 
         return [
-            PartitioningPattern(name="dense_out",pattern=dense_pt_out),
+            #PartitioningPattern(name="dense_out",pattern=dense_pt_out),
+            PartitioningPattern(name="dense_fp16",pattern=dense_fp16,additional_checks=only_out_fp16),
             PartitioningPattern(name="dense",pattern=dense_pt_requant,additional_checks=only_out_uint8),
             PartitioningPattern(name="conv2d",pattern=conv_pt_requant,additional_checks=only_std_convs),
PartitioningPattern(name="depthwise_conv2d",pattern=conv_pt_requant,additional_checks=only_dw_convs), diff --git a/examples/targets/carfield/run_fp.py b/examples/targets/carfield/run_fp.py new file mode 100644 index 0000000..c13eadb --- /dev/null +++ b/examples/targets/carfield/run_fp.py @@ -0,0 +1,33 @@ +import sys + +MATCH_PATH = "../../../.." +sys.path.append(f"{MATCH_PATH}/match/match-tvm/python") +sys.path.append(f"{MATCH_PATH}/match/zigzag") +sys.path.append(f"{MATCH_PATH}/match") +sys.path.append(".") + +import match +from match.utils.utils import get_default_inputs +from match.model.model import MatchModel +from carfield import Carfield + +INPUT_FILE_PATH = "model_fp/input.txt" +RELAY_FILE_PATH = "model_fp16/model_graph.relay" +RELAY_PARAMS_PATH = "model_fp16/model_params.txt" +OUTPUT_DIR = "output_fp" + +relay_mod, relay_params = match.get_relay_network(input_type="relay", filename=RELAY_FILE_PATH, params_filename=RELAY_PARAMS_PATH) + +oenne_model = MatchModel( + relay_mod = relay_mod, + relay_params = relay_params, + model_name = "model", + default_inputs = get_default_inputs(mod=relay_mod, params=relay_params, input_files=[INPUT_FILE_PATH]), + #handle_out_fn="handle_int_classifier", + debug=True +) +match.match( + model = oenne_model, + target = Carfield(), + output_path = OUTPUT_DIR, +) \ No newline at end of file diff --git a/match/cost_model/examples/pulp_cluster.py b/match/cost_model/examples/pulp_cluster.py index 75a27d4..e7c2ea3 100644 --- a/match/cost_model/examples/pulp_cluster.py +++ b/match/cost_model/examples/pulp_cluster.py @@ -94,8 +94,10 @@ def adjust_temporal_mapping(self, temporal_mapping_dict, operand_list, layer): min_innermost_loops=min([len(temporal_mapping_dict[operand][0]) for operand in operand_list]) new_innermost_loops=min_innermost_loops max_tile_found=False + c_k_mapping = "C" in layer.layer_attrs["operand_source_dimension_mapping"]["I"] and layer.layer_attrs["operand_source_dimension_mapping"]["I"]["C"]=="K" + 
ACCEPTED_UNEVEN_TILE_DIMENSIONS_ACT_OUT = ("K", "C") if not c_k_mapping else ("C",) for idx in range(min_innermost_loops, len(temporal_mapping_dict["I"][0])): - if (not max_tile_found) and (temporal_mapping_dict["I"][0][idx][0] in self.ACCEPTED_UNEVEN_TILE_DIMENSIONS_ACT_OUT): + if (not max_tile_found) and (temporal_mapping_dict["I"][0][idx][0] in ACCEPTED_UNEVEN_TILE_DIMENSIONS_ACT_OUT): new_innermost_loops=idx+1 else: max_tile_found = True