From c29c202ff8ff8d1dc5d4c3dcce0bd89ca171ae46 Mon Sep 17 00:00:00 2001 From: LuigiAltamura Date: Wed, 15 Nov 2023 21:44:53 +0100 Subject: [PATCH 1/6] Added MAESTRO-like mapping as a result --- src/GAMMA/gamma.py | 12 +++++++++--- src/GAMMA/train.py | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/GAMMA/gamma.py b/src/GAMMA/gamma.py index e9182d0..566a3df 100755 --- a/src/GAMMA/gamma.py +++ b/src/GAMMA/gamma.py @@ -842,10 +842,16 @@ def get_CONVtypeShape(self, dimensions, CONVtype=1): print("Not supported layer.") return dimensions - def write_maestro(self, indv, layer_id=0, m_file=None): + def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None): dimensions = [self.dimension] - with open("{}.m".format(m_file), "w") as fo: - fo.write("Network {} {{\n".format(layer_id)) + if layer_id != 0: + m_file_with_layer = "{}_{}".format(m_file, layer_id) + else: + m_file_with_layer = m_file + + file_path = os.path.join(folder_path or ".", "{}.m".format(m_file_with_layer)) + with open(file_path, "w") as fo: + fo.write("Network {} {{\n".format(m_file)) for i in range(len(dimensions)): dimension = dimensions[i] m_type = m_type_dicts[int(dimension[-1])] diff --git a/src/GAMMA/train.py b/src/GAMMA/train.py index 284496a..83cb70d 100644 --- a/src/GAMMA/train.py +++ b/src/GAMMA/train.py @@ -91,6 +91,7 @@ def train_model(model_defs, input_arg, map_cstr=None, chkpt_file='./chkpt'): np_array = np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["best_sol"]}']).reshape(1, -1) df = pd.DataFrame(np_array, columns=columns) df.to_csv(chkpt_file[:-4]+".csv") + env.write_maestro(best_sol, m_file = opt.model, layer_id = opt.singlelayer, folder_path=os.path.dirname(chkpt_file)) with open(chkpt_file, "wb") as fd: pickle.dump(chkpt, fd) From ddd2d2a4f634fb9f43671da0b8de66b84c3e4413 Mon Sep 17 00:00:00 2001 From: LuigiAltamura Date: Wed, 22 Nov 2023 17:57:45 +0100 Subject: [PATCH 2/6] Added scripts --- run_digamma.sh | 12 ++++++++++++ run_gamma_map_cstr.sh 
| 10 ++++++++++ script/avg_exec_time.sh | 24 ++++++++++++++++++++++++ script/avg_exec_time_digamma.sh | 24 ++++++++++++++++++++++++ script/avg_exec_time_map_cstr.sh | 24 ++++++++++++++++++++++++ script/avg_exec_time_with_hwconfig.sh | 24 ++++++++++++++++++++++++ script/delete_file.sh | 12 ++++++++++++ script/run_all.sh | 3 +++ script/run_digamma.sh | 12 ++++++++++++ script/run_gamma_map_cstr.sh | 10 ++++++++++ 10 files changed, 155 insertions(+) create mode 100755 run_digamma.sh create mode 100755 run_gamma_map_cstr.sh create mode 100755 script/avg_exec_time.sh create mode 100755 script/avg_exec_time_digamma.sh create mode 100755 script/avg_exec_time_map_cstr.sh create mode 100755 script/avg_exec_time_with_hwconfig.sh create mode 100755 script/delete_file.sh create mode 100755 script/run_all.sh create mode 100755 script/run_digamma.sh create mode 100755 script/run_gamma_map_cstr.sh diff --git a/run_digamma.sh b/run_digamma.sh new file mode 100755 index 0000000..d2c5438 --- /dev/null +++ b/run_digamma.sh @@ -0,0 +1,12 @@ +cd ./src/GAMMA + + +python main.py --num_pe -1 --area_budget 0.2 --pe_limit 200 --model vgg16 --outdir outdir_digamma + +cd ../../ + + + + + + diff --git a/run_gamma_map_cstr.sh b/run_gamma_map_cstr.sh new file mode 100755 index 0000000..f0278ad --- /dev/null +++ b/run_gamma_map_cstr.sh @@ -0,0 +1,10 @@ +cd ./src/GAMMA +python main.py --mapping_cstr dla_map --fitness1 latency --fitness2 power --num_pe 168 --l1_size 512 --l2_size 108000 --NocBW 81920000 --epochs 10 \ + --model vgg16 --outdir outdir_map +cd ../../ + + + + + + diff --git a/script/avg_exec_time.sh b/script/avg_exec_time.sh new file mode 100755 index 0000000..fb02bf5 --- /dev/null +++ b/script/avg_exec_time.sh @@ -0,0 +1,24 @@ +#!/bin/bash +cd .. 
+# Number of executions +num_executions=10 +total_execution_time_ns=0 + +for ((i=1; i<=$num_executions; i++)); do + + start_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + ./run_gamma.sh + + end_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + elapsed_time_ms=$((end_time - start_time)) + + total_execution_time_ms=$((total_execution_time_ms + elapsed_time_ms)) +done + + +average_execution_time_ms=$((total_execution_time_ms / num_executions)) + +# Report the average execution time in milliseconds +echo "Gamma -> Avg Execution Time: $average_execution_time_ms ms" >> "output.txt" diff --git a/script/avg_exec_time_digamma.sh b/script/avg_exec_time_digamma.sh new file mode 100755 index 0000000..631d391 --- /dev/null +++ b/script/avg_exec_time_digamma.sh @@ -0,0 +1,24 @@ +#!/bin/bash +cd .. +# Number of executions +num_executions=10 +total_execution_time_ns=0 + +for ((i=1; i<=$num_executions; i++)); do + + start_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + ./run_digamma.sh + + end_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + elapsed_time_ms=$((end_time - start_time)) + + total_execution_time_ms=$((total_execution_time_ms + elapsed_time_ms)) +done + + +average_execution_time_ms=$((total_execution_time_ms / num_executions)) + +# Report the average execution time in milliseconds +echo "DiGamma -> Avg Execution Time: $average_execution_time_ms ms" >> "output.txt" diff --git a/script/avg_exec_time_map_cstr.sh b/script/avg_exec_time_map_cstr.sh new file mode 100755 index 0000000..88f03f5 --- /dev/null +++ b/script/avg_exec_time_map_cstr.sh @@ -0,0 +1,24 @@ +#!/bin/bash +cd .. 
+# Number of executions +num_executions=10 +total_execution_time_ns=0 + +for ((i=1; i<=$num_executions; i++)); do + + start_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + ./run_gamma_map_cstr.sh + + end_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + elapsed_time_ms=$((end_time - start_time)) + + total_execution_time_ms=$((total_execution_time_ms + elapsed_time_ms)) +done + + +average_execution_time_ms=$((total_execution_time_ms / num_executions)) + +# Report the average execution time in milliseconds +echo "Map Cstr -> Avg Execution Time: $average_execution_time_ms ms" >> "output.txt" diff --git a/script/avg_exec_time_with_hwconfig.sh b/script/avg_exec_time_with_hwconfig.sh new file mode 100755 index 0000000..9f4b8c8 --- /dev/null +++ b/script/avg_exec_time_with_hwconfig.sh @@ -0,0 +1,24 @@ +#!/bin/bash +cd .. +# Number of executions +num_executions=10 +total_execution_time_ns=0 + +for ((i=1; i<=$num_executions; i++)); do + + start_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + ./run_gamma_with_hwconfig.sh + + end_time=$(perl -MTime::HiRes -e 'printf("%.0f\n",Time::HiRes::time()*1000)') + + elapsed_time_ms=$((end_time - start_time)) + + total_execution_time_ms=$((total_execution_time_ms + elapsed_time_ms)) +done + + +average_execution_time_ms=$((total_execution_time_ms / num_executions)) + +# Report the average execution time in milliseconds +echo "Av Execution Time: $average_execution_time_ms ms" diff --git a/script/delete_file.sh b/script/delete_file.sh new file mode 100755 index 0000000..5014a05 --- /dev/null +++ b/script/delete_file.sh @@ -0,0 +1,12 @@ +folder_path="$HOME/Desktop/Tesi/gamma/src/GAMMA" + +# Navigate to the folder +cd "$folder_path" || exit + +# Delete .csv files +find . -type f -name "*.csv" -delete + +# Delete .m files +find . 
-type f -name "*.m" -delete + +echo "Deletion complete for .csv and .m files in $folder_path" \ No newline at end of file diff --git a/script/run_all.sh b/script/run_all.sh new file mode 100755 index 0000000..176256f --- /dev/null +++ b/script/run_all.sh @@ -0,0 +1,3 @@ +./avg_exec_time.sh +./avg_exec_time_map_cstr.sh +./avg_exec_time_digamma.sh \ No newline at end of file diff --git a/script/run_digamma.sh b/script/run_digamma.sh new file mode 100755 index 0000000..d2c5438 --- /dev/null +++ b/script/run_digamma.sh @@ -0,0 +1,12 @@ +cd ./src/GAMMA + + +python main.py --num_pe -1 --area_budget 0.2 --pe_limit 200 --model vgg16 --outdir outdir_digamma + +cd ../../ + + + + + + diff --git a/script/run_gamma_map_cstr.sh b/script/run_gamma_map_cstr.sh new file mode 100755 index 0000000..f0278ad --- /dev/null +++ b/script/run_gamma_map_cstr.sh @@ -0,0 +1,10 @@ +cd ./src/GAMMA +python main.py --mapping_cstr dla_map --fitness1 latency --fitness2 power --num_pe 168 --l1_size 512 --l2_size 108000 --NocBW 81920000 --epochs 10 \ + --model vgg16 --outdir outdir_map +cd ../../ + + + + + + From 5fadd616c84834fc59b094246e03835b5594224e Mon Sep 17 00:00:00 2001 From: LuigiAltamura Date: Tue, 16 Jan 2024 16:45:42 +0100 Subject: [PATCH 3/6] Add multilayer support Now GAMMA supports multilayer analysis. Fixed a bug that overwrote the .csv result. 
Now there is a .csv for each layer --- requirements.txt | 4 ---- run_gamma.sh | 2 +- src/GAMMA/main.py | 2 +- src/GAMMA/train.py | 29 ++++++++++++++++++++++------- 4 files changed, 24 insertions(+), 13 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 73d4c0a..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy==1.18.5 -matplotlib==3.2.1 -pandas==1.0.4 -nevergrad==0.4.2 \ No newline at end of file diff --git a/run_gamma.sh b/run_gamma.sh index 23dc76f..fbc9e48 100755 --- a/run_gamma.sh +++ b/run_gamma.sh @@ -1,6 +1,6 @@ cd ./src/GAMMA python main.py --fitness1 latency --fitness2 power --num_pe 168 --l1_size 512 --l2_size 108000 --NocBW 81920000 --epochs 10 \ - --model vgg16 --singlelayer 1 + --model vgg16 --num_layer 13 #--singlelayer 1 cd ../../ diff --git a/src/GAMMA/main.py b/src/GAMMA/main.py index 9600ede..bd7f0ea 100644 --- a/src/GAMMA/main.py +++ b/src/GAMMA/main.py @@ -15,7 +15,7 @@ parser.add_argument('--offchipBW', type=int, default=-1, help='Off-chip BW') parser.add_argument('--hwconfig', type=str, default=None, help='HW configuration file') parser.add_argument('--model', type=str, default="resnet18", help='Model to run') - parser.add_argument('--num_layer', type=int, default=2, help='Number of layers to optimize') + parser.add_argument('--num_layer', type=int, default=0, help='Number of layers to optimize') parser.add_argument('--singlelayer', type=int, default=0, help='The layer index to optimize') parser.add_argument('--slevel_min', type=int, default=2, help='Minimum number of parallelization level') parser.add_argument('--slevel_max', type=int, default=2, help='Maximum number of parallelization level') diff --git a/src/GAMMA/train.py b/src/GAMMA/train.py index 83cb70d..d89bf19 100644 --- a/src/GAMMA/train.py +++ b/src/GAMMA/train.py @@ -64,6 +64,8 @@ def train_model(model_defs, input_arg, map_cstr=None, chkpt_file='./chkpt'): l2_size=opt.l2_size, 
NocBW=opt.NocBW, offchipBW=opt.offchipBW, slevel_min=opt.slevel_min, slevel_max=opt.slevel_max, fixedCluster=opt.fixedCluster, log_level=opt.log_level, map_cstr=map_cstr) constraints = {"area":opt.area_budget* 1e6} + chkpt_list = [] + num_layer = 1 for dimension in model_defs: env.reset_dimension(fitness=fitness, constraints=constraints, dimension=dimension) env.reset_hw_parm(num_pe=opt.num_pe, l1_size=opt.l1_size, l2_size=opt.l2_size, pe_limit=opt.pe_limit,area_pebuf_only=False, external_area_model=True) @@ -87,13 +89,26 @@ def train_model(model_defs, input_arg, map_cstr=None, chkpt_file='./chkpt'): "L1_size": best_l1_size, "L2_size": best_l2_size } - columns = ["runtime", "area", "pe_area_ratio", "PE", "L1_size", "L2_size", "PE_area", "L1_area", "L2_area","best_sol"] - np_array = np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["best_sol"]}']).reshape(1, -1) - df = pd.DataFrame(np_array, columns=columns) - df.to_csv(chkpt_file[:-4]+".csv") - env.write_maestro(best_sol, m_file = opt.model, layer_id = opt.singlelayer, folder_path=os.path.dirname(chkpt_file)) - with open(chkpt_file, "wb") as fd: - pickle.dump(chkpt, fd) + chkpt_list.append(chkpt) + if opt.num_layer != 0: + env.write_maestro(best_sol, m_file = opt.model, layer_id = num_layer, folder_path=os.path.dirname(chkpt_file)) + else: + env.write_maestro(best_sol, m_file = opt.model, layer_id = opt.singlelayer, folder_path=os.path.dirname(chkpt_file)) + + num_layer += 1 + + columns = ["runtime", "area", "pe_area_ratio", "PE", "L1_size", "L2_size", "PE_area", "L1_area", "L2_area","best_sol"] + np_array = None + for chkpt in chkpt_list: + if np_array is None: + np_array = np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["best_sol"]}']).reshape(1, -1) + else: + np_array = np.vstack([np_array, np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["best_sol"]}']).reshape(1, -1)]) + df = pd.DataFrame(np_array, columns=columns) + df.to_csv(chkpt_file[:-4]+".csv") + + with open(chkpt_file, "wb") as fd: + 
pickle.dump(chkpt_list, fd) def get_cstr_name(mapping_cstr): if mapping_cstr: From 9d9d1ee46a47b32c10ba2ff7299c459016f82402 Mon Sep 17 00:00:00 2001 From: LuigiAltamura Date: Mon, 22 Apr 2024 10:49:31 +0200 Subject: [PATCH 4/6] Add extract_dataflow.py --- extract_dataflow.py | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 extract_dataflow.py diff --git a/extract_dataflow.py b/extract_dataflow.py new file mode 100644 index 0000000..399ef2a --- /dev/null +++ b/extract_dataflow.py @@ -0,0 +1,52 @@ +import os +import argparse + +def extract_dataflow(file_name, output_folder): + # Read the content of the original file + with open(file_name, 'r') as f: + content = f.read() + + # Find the starting and ending indices of the Dataflow section + start_index = content.find("Dataflow {") + end_index = content.find("}", start_index) + 1 + + # Extract the Dataflow section + dataflow_section = content[start_index:end_index] + + # Add a new line after the closing parenthesis + dataflow_section += '\n' + + # Add a tab of shift to each line inside the Dataflow section, excluding the first and last lines + lines = dataflow_section.split('\n') + indented_lines = '\n'.join(lines[0:1] + ['\t' + line for line in lines[1:-2]] + lines[-2:]) + + # Create the output folder if it doesn't exist + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # Construct the output file path without the "_dataflow" suffix + output_file_name = os.path.splitext(os.path.basename(file_name))[0].replace('_dataflow', '') + output_file_path = os.path.join(output_folder, output_file_name + '.m') + + # Write the modified Dataflow content to the new file + with open(output_file_path, 'w') as f: + # Write the modified Dataflow section + f.write(indented_lines) + +if __name__ == "__main__": + # Create an argument parser + parser = argparse.ArgumentParser(description='Extract Dataflow from a file and save to a new file.') + + # Add the file and 
output folder arguments + parser.add_argument('--file', type=str, help='Path of the input file') + parser.add_argument('--out', type=str, help='Path of the output folder') + + # Parse the command-line arguments + args = parser.parse_args() + + # Check if both --file and --out arguments are provided + if args.file and args.out: + # Extract the Dataflow part and save it to the output folder + extract_dataflow(args.file, args.out) + else: + print("Both --file and --out arguments are required.") From 0277b628bfc32a714fe24bfedae12b62e6a68723 Mon Sep 17 00:00:00 2001 From: LuigiAltamura Date: Wed, 8 May 2024 16:10:57 +0200 Subject: [PATCH 5/6] Add mixed precision to GAMMA --- build.py | 2 +- .../costmodel_cstr/maestro_cstr.py | 3 + data/model/vgg16.csv | 28 ++-- src/GAMMA/gamma.py | 101 +++++++++--- src/GAMMA/main.py | 36 ++--- src/GAMMA/train.py | 151 ++++++++++++++---- 6 files changed, 230 insertions(+), 91 deletions(-) diff --git a/build.py b/build.py index bb69cd7..efe6146 100644 --- a/build.py +++ b/build.py @@ -1,7 +1,7 @@ import os, sys commit_id = 'e1d8efd8e5469cf865a9db60007a70e3f0cb8778' dst_path = "cost_model/maestro" -maestro_dir = "../maestro" +maestro_dir = "../qmaestro" working_path = os.getcwd() dst_path = os.path.join(working_path, dst_path) maestro = os.path.join(maestro_dir, "maestro") diff --git a/data/mapping_cstr/advanced_cstr/costmodel_cstr/maestro_cstr.py b/data/mapping_cstr/advanced_cstr/costmodel_cstr/maestro_cstr.py index 14a0e21..ac3c67f 100644 --- a/data/mapping_cstr/advanced_cstr/costmodel_cstr/maestro_cstr.py +++ b/data/mapping_cstr/advanced_cstr/costmodel_cstr/maestro_cstr.py @@ -1,5 +1,8 @@ import numpy as np mapping_cstr = {} +mapping_cstr["L3"] = {"R":"R", + "S":"S", + } mapping_cstr["L2"] = {"R":"R", "S":"S", } diff --git a/data/model/vgg16.csv b/data/model/vgg16.csv index bd43515..c935806 100755 --- a/data/model/vgg16.csv +++ b/data/model/vgg16.csv @@ -1,14 +1,14 @@ -K,C,Y,X,R,S,T -64,3,224,224,3,3,1 -64,64,224,224,3,3,1 
-128,64,112,112,3,3,1 -128,128,112,112,3,3,1 -256,128,56,56,3,3,1 -256,256,56,56,3,3,1 -256,256,56,56,3,3,1 -512,256,28,28,3,3,1 -512,512,28,28,3,3,1 -512,512,28,28,3,3,1 -512,512,14,14,3,3,1 -512,512,14,14,3,3,1 -512,512,14,14,3,3,1 +K,C,Y,X,R,S,T,Precision +64,3,224,224,3,3,1,FP16 +64,64,224,224,3,3,1,FP16 +128,64,112,112,3,3,1,FP16 +128,128,112,112,3,3,1,FP16 +256,128,56,56,3,3,1,FP16 +256,256,56,56,3,3,1,FP16 +256,256,56,56,3,3,1,FP16 +512,256,28,28,3,3,1,FP16 +512,512,28,28,3,3,1,FP16 +512,512,28,28,3,3,1,FP16 +512,512,14,14,3,3,1,FP16 +512,512,14,14,3,3,1,FP16 +512,512,14,14,3,3,1,FP16 diff --git a/src/GAMMA/gamma.py b/src/GAMMA/gamma.py index 566a3df..42d169b 100755 --- a/src/GAMMA/gamma.py +++ b/src/GAMMA/gamma.py @@ -1,3 +1,5 @@ +import functools + import numpy as np import copy, random import os @@ -60,7 +62,7 @@ def __init__(self,dimension, map_cstr=None, num_pe=64, pe_limit=1024, fitness="l self.area_pebuf_only=False self.external_area_model = False - def reset_hw_parm(self, l1_size=None, l2_size=None, num_pe=None, NocBW=None, map_cstr=None, pe_limit=None,area_pebuf_only=None, external_area_model=None, offchipBW=None): + def reset_hw_parm(self, l1_size=None, l2_size=None, num_pe=None, NocBW=None, map_cstr=None, pe_limit=None,area_pebuf_only=None, external_area_model=None, offchipBW=None, slevel_max=None, slevel_min=None): if l1_size: self.l1_size=l1_size if l1_size > 0 else 2**30 if l2_size: @@ -79,6 +81,12 @@ def reset_hw_parm(self, l1_size=None, l2_size=None, num_pe=None, NocBW=None, map self.area_pebuf_only = area_pebuf_only if external_area_model: self.external_area_model = external_area_model + if map_cstr: + self.map_cstr = map_cstr + if slevel_min: + self.slevel_min = slevel_min + if slevel_max: + self.slevel_max = slevel_max def get_dimension_factors(self, dimension_dict): dimension_factors = dict() @@ -548,7 +556,7 @@ def create_unit_base_pops(self, population, num_all_unit=None): for i in range(1, 7): population[idx][i + level * 7][1] = 1 - 
def reinit_pop(self,pool, num_population, stage_idx, best_sol_1st, init_pop, cur_gen=-1, bias= None, num_all_unit=2): + def reinit_pop(self,pool, num_population, stage_idx, best_sol_1st, init_pop, cur_gen=-1, bias= None, num_all_unit=2, precision=None): population = [self.create_genome_fixedSL(bias=bias) for _ in range(num_population)] #====always create a base unit pop======= self.create_unit_base_pops(population, num_all_unit=num_all_unit) @@ -565,7 +573,8 @@ def reinit_pop(self,pool, num_population, stage_idx, best_sol_1st, init_pop, cu self.num_parents = num_population self.comform_to_cstr(population) self.fitness = np.ones((max(num_population, len(population)), len(self.fitness_objective)), float) - self.evaluate(pool=pool, population=population,cur_gen=cur_gen) + + self.evaluate(pool=pool, population=population,cur_gen=cur_gen, precision=precision) return population @@ -640,14 +649,15 @@ def adjust_fitness(self, fitness): gen_best_idx = np.argmax(fitness[:,0]) return fitness, gen_best_idx - def evaluate(self, pool, population, cur_gen=-1): + def evaluate(self, pool, population, cur_gen=-1, precision=None): gen_best = -float("Inf") gen_best_activity = None gen_best_idx = 0 count_non_valid = 0 # populations = pool.map(self.thread_fun_correctify_tile_dependency, population) # population[:] = populations - reward_activ_list = pool.map(self.thread_fun, population) + partial_thread_fun = functools.partial(self.thread_fun, precision=precision) + reward_activ_list = pool.map(partial_thread_fun, population) for i in range(len(population)): reward, activity_count = reward_activ_list[i] @@ -721,17 +731,17 @@ def injection(self, inject_ratio=1.0): return pop_inj, inj_fitness def run(self, dimension, stage_idx=0, prev_stage_value=0, num_population=100, num_generations=100, elite_ratio=0.05, - parents_ratio=0.4, ratio_decay=1, num_finetune=1, best_sol_1st=None, init_pop=None, bias=None, uni_base=True, use_factor=False, use_pleteau=False, L1_bias_template=None): + 
parents_ratio=0.4, ratio_decay=1, num_finetune=1, best_sol_1st=None, init_pop=None, bias=None, uni_base=True, use_factor=False, use_pleteau=False, L1_bias_template=None, precision=None): self.init_arguement(dimension=dimension, stage_idx=stage_idx, prev_stage_value=prev_stage_value, num_population=num_population, num_generations=num_generations, elite_ratio=elite_ratio, parents_ratio=parents_ratio, ratio_decay=ratio_decay, num_finetune=num_finetune, best_sol_1st=best_sol_1st, init_pop=init_pop,uni_base=uni_base, use_factor=use_factor, use_pleteau=use_pleteau,L1_bias_template=L1_bias_template) pool = Pool(min(self.num_population + self.num_elite, cpu_count())) - population = self.reinit_pop(pool,self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, bias=bias) + population = self.reinit_pop(pool,self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, bias=bias, precision=precision) if self.map_cstr: self.cstr_list, self.num_free_order, self.num_free_par = self.map_cstr.get_cstr_list(copy.deepcopy(population[0]), fixed_sp_sz=self.fixedCluster) for g in range(num_generations): while self.num_parents < 1: # restart - population = self.reinit_pop(pool, self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, cur_gen=g) + population = self.reinit_pop(pool, self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, cur_gen=g, precision=precision) print("Reinitialize population") population, self.fitness, self.parents = self.select_parents(population, self.fitness, self.num_parents, self.num_population,) @@ -769,12 +779,12 @@ def run(self, dimension, stage_idx=0, prev_stage_value=0, num_population=100, nu # population = elite + population + pop_inj self.fitness = np.concatenate((self.elite_fitness, self.fitness)) # self.fitness = np.concatenate((self.elite_fitness, self.fitness, inj_fitness)) - chkpt = self.evaluate(pool=pool, population=population, cur_gen=g) + chkpt = self.evaluate(pool=pool, population=population, 
cur_gen=g, precision=precision) # self.check_tile_dependency(population) if self.log_level>1: if chkpt["best_sol"] is not None and self.log_level>1: - best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = self.get_indiv_info( chkpt["best_sol"]) + best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = self.get_indiv_info( chkpt["best_sol"], precision=precision) # best_num_pe = chkpt["best_sol"][0][1] if self.num_pe<1 else self.num_pe # print(f"Runtime: {best_runtime}, L1: {best_l1_size}, L2: {best_l2_size}, L1_usage:{best_l1_size/self.l1_size:}, L2_usage:{best_l2_size/self.l2_size:.4f}, PE: {best_num_pe}") print(f"Gen {g+1}: Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime}, Area: {best_area/1e6:.3f}mm2, PE Area_ratio: {best_num_pe*MAC_AREA_INT8/best_area*100:.1f}%, L1: {best_l1_size}, L2: {best_l2_size}, PE: {best_num_pe}") @@ -820,12 +830,12 @@ def sort_population(self, population): def thread_fun_correctify_tile_dependency(self, indv): return self.correctify_tile_dependency_thread(indv) - def thread_fun(self, individual): - reward, activity_count = self.oberserve_maestro(individual) + def thread_fun(self, individual, precision=None): + reward, activity_count = self.oberserve_maestro(individual, precision=precision) return [reward, activity_count] - def get_indiv_info(self, individual, num_pe=None, l1_size=None, l2_size=None, NocBW=None): - self.oberserve_maestro(individual,num_pe=num_pe, l1_size=l1_size, l2_size=l2_size, NocBW=NocBW) + def get_indiv_info(self, individual, num_pe=None, l1_size=None, l2_size=None, NocBW=None, precision=None): + self.oberserve_maestro(individual,num_pe=num_pe, l1_size=l1_size, l2_size=l2_size, NocBW=NocBW, precision=precision) return self.observation def get_CONVtypeShape(self, dimensions, CONVtype=1): @@ -842,7 +852,7 @@ def get_CONVtypeShape(self, dimensions, CONVtype=1): print("Not 
supported layer.") return dimensions - def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None): + def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None, precision=None): dimensions = [self.dimension] if layer_id != 0: m_file_with_layer = "{}_{}".format(m_file, layer_id) @@ -861,6 +871,8 @@ def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None): fo.write( "Dimensions {{ K: {:.0f}, C: {:.0f}, Y: {:.0f}, X: {:.0f}, R: {:.0f}, S: {:.0f} }}\n".format( *dimension)) + if precision is not None: + fo.write("Precision: {{ {} }}\n".format(precision)) fo.write("Dataflow {\n") for k in range(0, len(indv), 7): for i in range(k, k + 7): @@ -888,16 +900,16 @@ def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None): fo.write("}\n") fo.write("}") - def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW=None, offchipBW=None): + def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW=None, offchipBW=None, precision=None): m_file = "{}".format(random.randint(0, 2**32)) - self.write_maestro(indv,m_file=m_file) + self.write_maestro(indv, m_file=m_file, precision=precision) if num_pe: - to_use_num_pe = num_pe + to_use_num_pe = self.num_pe_to_use(num_pe, precision) elif self.num_pe <1: - to_use_num_pe = indv[0][1] + to_use_num_pe = self.num_pe_to_use(indv[0][1], precision) else: - to_use_num_pe = self.num_pe + to_use_num_pe = self.num_pe_to_use(self.num_pe, precision) # print(num_pe, bw, l1_size) os.remove("./{}.csv".format(m_file)) if os.path.exists("./{}.csv".format(m_file)) else None command = [self._executable, @@ -908,6 +920,8 @@ def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW "--noc_mc_support=true", "--num_pes={}".format(int(to_use_num_pe)), "--num_simd_lanes=1", "--l1_size_cstr={}".format(self.l1_size if not l1_size else l1_size), "--l2_size_cstr={}".format(self.l2_size if not l2_size else l2_size), 
"--print_res=false", "--print_res_csv_file=true", "--print_log_file=false", "--print_design_space=false", "--msg_print_lv=0"] +# "--num_simd_lanes=1", "--l1_size_cstr={}".format(int(self.l1_to_use(self.l1_size, precision)) if not l1_size else + # int(self.l1_to_use(l1_size, precision))), process = Popen(command, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() @@ -960,7 +974,8 @@ def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW elif self.area_pebuf_only: area = self.compute_area_maestro(to_use_num_pe, l1_size, l2_size) - self.observation = [np.mean(x) for x in [runtime, throughput, energy, area, l1_size, l2_size, mac, power, to_use_num_pe]] + self.observation = [np.mean(x) for x in [runtime, throughput, energy, area, l1_size, l2_size, mac, power, + self.restore_num_pe(to_use_num_pe, precision)]] def catch_exception(): if l1_size>self.l1_size or l2_size>self.l2_size or any(runtime_series<1) or any(l1_size_series<1) or any(l2_size_series<1): return True @@ -977,6 +992,50 @@ def catch_exception(): except: return None, None + def num_pe_to_use(self, num_pe, precision): + + if precision is None or precision == "FP32": + return num_pe + if precision == "FP16": + return num_pe / 2 + if precision == "FP8": + return num_pe / 4 + if precision == "INT32": + return num_pe + if precision == "INT16": + return num_pe / 2 + if precision == "INT8": + return num_pe / 4 + + def restore_num_pe(self, num_pe, precision): + + if precision is None or precision == "FP32": + return num_pe + if precision == "FP16": + return num_pe * 2 + if precision == "FP8": + return num_pe * 4 + if precision == "INT32": + return num_pe + if precision == "INT16": + return num_pe * 2 + if precision == "INT8": + return num_pe * 4 + + def l1_to_use(self, l1_size, precision): + + if precision is None or precision == "FP32": + return l1_size + if precision == "FP16": + return l1_size * 2 + if precision == "FP8": + return l1_size * 4 + if precision == "INT32": + 
return l1_size + if precision == "INT16": + return l1_size * 2 + if precision == "INT8": + return l1_size * 4 def impose_halloffame(self, observe_value, target="latency_ave" ): is_violated = False if self.stat is not None: diff --git a/src/GAMMA/main.py b/src/GAMMA/main.py index bd7f0ea..2d180cb 100644 --- a/src/GAMMA/main.py +++ b/src/GAMMA/main.py @@ -17,8 +17,8 @@ parser.add_argument('--model', type=str, default="resnet18", help='Model to run') parser.add_argument('--num_layer', type=int, default=0, help='Number of layers to optimize') parser.add_argument('--singlelayer', type=int, default=0, help='The layer index to optimize') - parser.add_argument('--slevel_min', type=int, default=2, help='Minimum number of parallelization level') - parser.add_argument('--slevel_max', type=int, default=2, help='Maximum number of parallelization level') + parser.add_argument('--slevel_min', type=int, default=3, help='Minimum number of parallelization level') + parser.add_argument('--slevel_max', type=int, default=3, help='Maximum number of parallelization level') parser.add_argument('--fixedCluster', type=int, default=0, help='Rigid cluster size') parser.add_argument('--log_level', type=int, default=1, help='Detail: 2, runtimeinfo: 1') parser.add_argument('--costmodel_cstr', type=str, default='maestro_cstr', help='Constraint from Cost model') @@ -41,7 +41,17 @@ m_file_path = "../../data/model/" m_file = os.path.join(m_file_path, opt.model + ".csv") df = pd.read_csv(m_file) - model_defs = df.to_numpy() + model = df.to_numpy() + model_defs = [] + precision_array = [] + + for sublist in model: + string_indices = [i for i, elem in enumerate(sublist) if isinstance(elem, str)] + precision_array.extend(elem for elem in sublist if isinstance(elem, str)) + sublist = np.delete(sublist, string_indices) + model_defs.append(sublist.tolist()) + model_defs = np.array(model_defs) + if opt.singlelayer: model_defs=model_defs[opt.singlelayer-1:opt.singlelayer] else: @@ -63,30 +73,12 @@ 
os.makedirs(outdir_exp, exist_ok=True) chkpt_file_t = "{}".format("result") chkpt_file = os.path.join(outdir_exp, chkpt_file_t + "_c.plt") - map_cstr = None - if opt.accel_cstr: - accel_file = importlib.import_module(f'data.mapping_cstr.advanced_cstr.accel_cstr.{opt.accel_cstr}') - accelator_cstr = accel_file.accel_cstr - map_cstr = Constraint(num_pe=opt.num_pe) - translate_to_actual_cstr(accelator_cstr, map_cstr) - - if opt.mapping_cstr: - mapping_file = importlib.import_module(f'data.mapping_cstr.{opt.mapping_cstr}') - mapping_cstr = mapping_file.mapping_cstr - map_cstr = Constraint(num_pe=opt.num_pe) if not map_cstr else map_cstr - put_into_actual_cstr(mapping_cstr, map_cstr) - - if opt.costmodel_cstr: - mapping_file = importlib.import_module(f'data.mapping_cstr.advanced_cstr.costmodel_cstr.{opt.costmodel_cstr}') - costmodel_cstr = mapping_file.mapping_cstr - map_cstr = Constraint(num_pe=opt.num_pe) if not map_cstr else map_cstr - put_into_actual_cstr(costmodel_cstr, map_cstr) if check_tpu(opt.accel_cstr, opt.mapping_cstr): model_defs = translate_to_gemm(model_defs) try: - train_model(model_defs, input_arg=opt, map_cstr=map_cstr, chkpt_file=chkpt_file) + train_model(model_defs, input_arg=opt, chkpt_file=chkpt_file, precisions=precision_array) finally: for f in glob.glob("*.m"): diff --git a/src/GAMMA/train.py b/src/GAMMA/train.py index d89bf19..bf901e2 100644 --- a/src/GAMMA/train.py +++ b/src/GAMMA/train.py @@ -1,29 +1,28 @@ - import copy import argparse from datetime import datetime import glob import os, sys + script_dir = os.path.dirname(__file__) module_path = os.path.abspath(os.path.join(script_dir, '../')) project_path = os.path.abspath(os.path.join(script_dir, '../../')) if module_path not in sys.path: - sys.path.insert(0,module_path) + sys.path.insert(0, module_path) if project_path not in sys.path: - sys.path.insert(0,project_path) + sys.path.insert(0, project_path) from utils import * import gamma as gamma -from math import ceil import importlib -from 
shutil import copyfile + fitness_list = None fitness = None stage_idx = 0 prev_stage_value = [] tune_iter = 1 opt = None -MAC_AREA_MAESTRO=4470 +MAC_AREA_MAESTRO = 4470 MAC_AREA_INT8 = 282 BUF_AREA_perbit = 0.086 L2BUF_AREA_MAESTRO = 4161.536 @@ -31,58 +30,81 @@ L2BUF_UNIT = 32768 L1BUF_UNIT = 64 +# bias = {"par": {1: "K", 2:"C"}, "order":{1:["K", "C"]}, "tiles": {1:{"K":0.1, "C":0.2}, 2:{"K":0.3}}} +bias = {"par": {1: "K", 2: "C"}, "order": {1: ["K", "C", "Y", "X"], 2: ["K", "C", "Y", "X"]}} -# bias = {"par": {1: "K", 2:"C"}, "order":{1:["K", "C"]}, "tiles": {1:{"K":0.1, "C":0.2}, 2:{"K":0.3}}} -bias = {"par": {1: "K", 2:"C"}, "order":{1:["K", "C","Y", "X"], 2:["K", "C","Y", "X"]}} # bias = {"par": {1: "K", 2:"C"}} # bias = {"par": {1: "Y"}} -def get_pe_usage(env, sol, num_pe ): +def get_pe_usage(env, sol, num_pe): util_num_pe = num_pe - baseline = env.get_indiv_info( sol, num_pe=num_pe) + baseline = env.get_indiv_info(sol, num_pe=num_pe) best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = baseline baseline = np.array(baseline)[:-2] - for i in range(num_pe-1): + for i in range(num_pe - 1): util_num_pe -= 1 cur = env.get_indiv_info(sol, num_pe=util_num_pe) best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = cur cur = np.array(cur)[:-2] - if sum(baseline!=cur)>1: + if sum(baseline != cur) > 1: util_num_pe += 1 break return util_num_pe -def train_model(model_defs, input_arg, map_cstr=None, chkpt_file='./chkpt'): + +def train_model(model_defs, input_arg, chkpt_file='./chkpt', precisions=None): global opt opt = input_arg - fitness = [ opt.fitness1, opt.fitness2] + fitness = [opt.fitness1, opt.fitness2] dimension = model_defs[0] + map_cstr = None env = gamma.GAMMA(dimension=dimension, num_pe=opt.num_pe, fitness=fitness, par_RS=opt.parRS, l1_size=opt.l1_size, - l2_size=opt.l2_size, NocBW=opt.NocBW, offchipBW=opt.offchipBW, 
slevel_min=opt.slevel_min, slevel_max=opt.slevel_max, + l2_size=opt.l2_size, NocBW=opt.NocBW, offchipBW=opt.offchipBW, slevel_min=opt.slevel_min, + slevel_max=opt.slevel_max, fixedCluster=opt.fixedCluster, log_level=opt.log_level, map_cstr=map_cstr) - constraints = {"area":opt.area_budget* 1e6} + constraints = {"area": opt.area_budget * 1e6} chkpt_list = [] num_layer = 1 + for dimension in model_defs: + + # ridefinire i valori di PE e l1 in base al tipo di quantizzazione + if len(precisions): + precision = precisions[num_layer - 1] + else: + precision = None + + map_cstr = map_constraints(map_cstr, opt, precision) + env.reset_dimension(fitness=fitness, constraints=constraints, dimension=dimension) - env.reset_hw_parm(num_pe=opt.num_pe, l1_size=opt.l1_size, l2_size=opt.l2_size, pe_limit=opt.pe_limit,area_pebuf_only=False, external_area_model=True) + env.reset_hw_parm(num_pe=get_value_for_pe(precision, opt.num_pe), + l1_size=opt.l1_size, + l2_size=opt.l2_size, pe_limit=opt.pe_limit, + area_pebuf_only=False, external_area_model=True, map_cstr=map_cstr, + slevel_max=get_value_for_precision(precision), slevel_min=get_value_for_precision(precision)) + #tolto calcolo su l1 value da verificare se rimettere chkpt, pops = env.run(dimension, stage_idx=0, num_population=opt.num_pop, prev_stage_value=None, - num_generations=opt.epochs, - best_sol_1st=None, init_pop=None, bias=None, uni_base=True, use_factor=opt.use_factor, use_pleteau=False) + num_generations=opt.epochs, + best_sol_1st=None, init_pop=None, bias=None, uni_base=True, use_factor=opt.use_factor, + use_pleteau=False, precision=precision) best_sol = chkpt["best_sol"] - best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = env.get_indiv_info(best_sol, num_pe=None) + best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = env.get_indiv_info( + best_sol, num_pe=None, precision=precision) 
print("Mapping:", chkpt["best_sol"]) - print(f"Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime:.0f}(cycles), Area: {best_area/1e6:.3f}(mm2), PE Area_ratio: {best_num_pe*MAC_AREA_INT8/best_area*100:.1f}%, Num_PE: {best_num_pe:.0f}, L1 Buffer: {best_l1_size:.0f}(elements), L2 Buffer: {best_l2_size:.0f}(elements)") + print( + f"Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime:.0f}(cycles), Area: {best_area / 1e6:.3f}(mm2), PE Area_ratio: {best_num_pe * MAC_AREA_INT8 / best_area * 100:.1f}%, Num_PE: {best_num_pe:.0f}, L1 Buffer: {best_l1_size:.0f}(elements), L2 Buffer: {best_l2_size:.0f}(elements)") chkpt = { - "reward":chkpt['best_reward'][0], - "best_sol":best_sol, - "runtime":best_runtime, - "area":best_area, - "pe_area_ratio":best_num_pe*MAC_AREA_INT8/best_area, - "PE":best_num_pe, + "reward": chkpt['best_reward'][0], + "Best_solution": best_sol, + "Runtime": best_runtime, + "Throughput (MACs/Cycle)": best_throughput, + "Activity count-based Energy (nJ)": best_energy, + "Area": best_area, + "PE_Area_Ratio": best_num_pe * MAC_AREA_INT8 / best_area, + "PE": best_num_pe, "PE_area": best_num_pe * MAC_AREA_INT8, "L1_area": best_l1_size * best_num_pe * BUF_AREA_perbit * 8, "L2_area": best_l2_size * BUF_AREA_perbit * 8, @@ -91,25 +113,30 @@ def train_model(model_defs, input_arg, map_cstr=None, chkpt_file='./chkpt'): } chkpt_list.append(chkpt) if opt.num_layer != 0: - env.write_maestro(best_sol, m_file = opt.model, layer_id = num_layer, folder_path=os.path.dirname(chkpt_file)) + env.write_maestro(best_sol, m_file=opt.model, layer_id=num_layer, folder_path=os.path.dirname(chkpt_file), + precision=precision) else: - env.write_maestro(best_sol, m_file = opt.model, layer_id = opt.singlelayer, folder_path=os.path.dirname(chkpt_file)) + env.write_maestro(best_sol, m_file=opt.model, layer_id=opt.singlelayer, + folder_path=os.path.dirname(chkpt_file), precision=precision) num_layer += 1 - columns = ["runtime", "area", "pe_area_ratio", "PE", "L1_size", 
"L2_size", "PE_area", "L1_area", "L2_area","best_sol"] + columns = ["Runtime", "Throughput (MACs/Cycle)", "Activity count-based Energy (nJ)", "Area", "PE_Area_Ratio", "PE", "L1_size", "L2_size", "PE_area", "L1_area", "L2_area", + "Best_solution"] np_array = None for chkpt in chkpt_list: if np_array is None: - np_array = np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["best_sol"]}']).reshape(1, -1) + np_array = np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["Best_solution"]}']).reshape(1, -1) else: - np_array = np.vstack([np_array, np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["best_sol"]}']).reshape(1, -1)]) + np_array = np.vstack( + [np_array, np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["Best_solution"]}']).reshape(1, -1)]) df = pd.DataFrame(np_array, columns=columns) - df.to_csv(chkpt_file[:-4]+".csv") + df.to_csv(chkpt_file[:-4] + ".csv", index_label="Layer") with open(chkpt_file, "wb") as fd: pickle.dump(chkpt_list, fd) + def get_cstr_name(mapping_cstr): if mapping_cstr: cstr_name = mapping_cstr @@ -118,3 +145,61 @@ def get_cstr_name(mapping_cstr): return cstr_name +def get_value_for_precision(precision): + if precision is None or precision == "FP32": + return 2 + else: + return 3 + + +def get_value_for_pe(precision, num_pe): + if precision is None or precision == "FP32": + return num_pe + if precision == "FP16": + return int(num_pe * 2) + if precision == "FP8": + return int(num_pe * 4) + if precision == "INT32": + return int(num_pe) + if precision == "INT16": + return int(num_pe * 2) + if precision == "INT8": + return int(num_pe * 4) + + +def get_value_for_l1(precision, l1_size): + if precision is None or precision == "FP32": + return l1_size + if precision == "FP16": + return int(l1_size / 2) + if precision == "FP8": + return int(l1_size / 4) + if precision == "INT32": + return l1_size + if precision == "INT16": + return int(l1_size / 2) + if precision == "INT8": + return int(l1_size / 4) + + +def map_constraints(map_cstr, opt, 
precision): + if opt.accel_cstr: + accel_file = importlib.import_module(f'data.mapping_cstr.advanced_cstr.accel_cstr.{opt.accel_cstr}') + accelator_cstr = accel_file.accel_cstr + map_cstr = Constraint(num_pe=get_value_for_pe(precision, opt.num_pe)) + translate_to_actual_cstr(accelator_cstr, map_cstr) + + if opt.mapping_cstr: + mapping_file = importlib.import_module(f'data.mapping_cstr.{opt.mapping_cstr}') + mapping_cstr = mapping_file.mapping_cstr + map_cstr = Constraint(num_pe=get_value_for_pe(precision, opt.num_pe)) if not map_cstr else map_cstr + put_into_actual_cstr(mapping_cstr, map_cstr) + + if opt.costmodel_cstr: + mapping_file = importlib.import_module( + f'data.mapping_cstr.advanced_cstr.costmodel_cstr.{opt.costmodel_cstr}') + costmodel_cstr = mapping_file.mapping_cstr + map_cstr = Constraint(num_pe=get_value_for_pe(precision, opt.num_pe)) if not map_cstr else map_cstr + put_into_actual_cstr(costmodel_cstr, map_cstr) + + return map_cstr From 3687a13646d24419b7b0c62d1c89d89a6e795885 Mon Sep 17 00:00:00 2001 From: LuigiAltamura Date: Sun, 30 Jun 2024 17:52:41 +0200 Subject: [PATCH 6/6] Add mixed precision to GAMMA --- extract_dataflow.py | 67 +-- from_maestro_to_gamma.py | 49 ++ src/GAMMA/cost_database.py | 105 +++++ src/GAMMA/gamma.py | 586 +++++++++++++++--------- src/GAMMA/main.py | 4 +- src/GAMMA/train.py | 94 +++- src/Other_Blackbox_optimization/main.py | 4 +- src/utils/constraint.py | 65 +-- 8 files changed, 686 insertions(+), 288 deletions(-) create mode 100644 from_maestro_to_gamma.py create mode 100644 src/GAMMA/cost_database.py diff --git a/extract_dataflow.py b/extract_dataflow.py index 399ef2a..76338c1 100644 --- a/extract_dataflow.py +++ b/extract_dataflow.py @@ -1,52 +1,61 @@ import os import argparse -def extract_dataflow(file_name, output_folder): +def extract_sections(file_name, output_folder, output_file_name=None): # Read the content of the original file with open(file_name, 'r') as f: content = f.read() - # Find the starting and ending 
indices of the Dataflow section - start_index = content.find("Dataflow {") - end_index = content.find("}", start_index) + 1 - - # Extract the Dataflow section - dataflow_section = content[start_index:end_index] - - # Add a new line after the closing parenthesis - dataflow_section += '\n' - - # Add a tab of shift to each line inside the Dataflow section, excluding the first and last lines - lines = dataflow_section.split('\n') - indented_lines = '\n'.join(lines[0:1] + ['\t' + line for line in lines[1:-2]] + lines[-2:]) + # Initialize sections + precision_section = "" + dataflow_section = "" + + # Find the Precision section, if it exists + precision_start_index = content.find("Precision:") + if precision_start_index != -1: + precision_end_index = content.find("}", precision_start_index) + 1 + precision_section = content[precision_start_index:precision_end_index] + precision_section += '\n' + + # Find the Dataflow section + dataflow_start_index = content.find("Dataflow {") + if dataflow_start_index != -1: + dataflow_end_index = content.find("}", dataflow_start_index) + 1 + dataflow_section = content[dataflow_start_index:dataflow_end_index] + dataflow_section += '\n' + + # Indent lines in Dataflow section (excluding the first and last lines) + dataflow_lines = dataflow_section.split('\n') + indented_dataflow_lines = '\n'.join(dataflow_lines[0:1] + ['\t' + line for line in dataflow_lines[1:-1]] + dataflow_lines[-1:]) + dataflow_section = indented_dataflow_lines # Create the output folder if it doesn't exist if not os.path.exists(output_folder): os.makedirs(output_folder) - # Construct the output file path without the "_dataflow" suffix - output_file_name = os.path.splitext(os.path.basename(file_name))[0].replace('_dataflow', '') - output_file_path = os.path.join(output_folder, output_file_name + '.m') + # Determine the output file name + if output_file_name is None: + output_file_name = os.path.basename(file_name) + output_file_path = os.path.join(output_folder, 
def parse_m_file(file_path, model_dir='../qmaestro/data/model/'):
    """Parse the layer dimensions out of a ``.m`` model file.

    Args:
        file_path: file name, appended verbatim to ``model_dir``.
        model_dir: directory prefix; it must end with a path separator because
            the two strings are concatenated.  Defaults to the location that
            was previously hard-coded.

    Returns:
        One dict per ``Dimensions { ... }`` section, mapping each
        single-character dimension name to its integer value, plus a ``'T'``
        entry that is 2 when the matching ``Type:`` is ``DSCONV`` and 1
        otherwise.

    Raises:
        IndexError: if the file has more ``Dimensions`` sections than
            ``Type:`` entries.
    """
    # The file used to be opened twice in nested ``with`` blocks; once is enough.
    with open(model_dir + file_path, 'r') as file:
        content = file.read()

    layers_type = re.findall(r'Type: (\w+)', content, re.DOTALL)
    dimension_matches = re.findall(r'Dimensions \{.*?\}', content, re.DOTALL)

    layers = []
    for i, dimensions in enumerate(dimension_matches):
        # Two capture groups make findall return (name, digits) tuples.
        dim_dict = {name: int(value)
                    for name, value in re.findall(r'\b(\w): (\d+)', dimensions)}
        dim_dict['T'] = 2 if layers_type[i] == 'DSCONV' else 1
        layers.append(dim_dict)
    return layers


def write_csv(layers, precision, output_file, output_dir='./data/model/'):
    """Write the parsed layers to a CSV file, one row per layer.

    Args:
        layers: list of dimension dicts as returned by ``parse_m_file``.
        precision: value written into the ``Precision`` column of every row.
        output_file: file name, appended verbatim to ``output_dir``.
        output_dir: directory prefix (must end with a path separator).
            Defaults to the location that was previously hard-coded.
    """
    dimension_columns = ("K", "C", "Y", "X", "R", "S", "T")
    with open(output_dir + output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([*dimension_columns, "Precision"])
        for layer in layers:
            # Missing dimensions are written as 0.  'T' is taken from the layer
            # dict; the old local ``t_value`` read a 'Type' key that
            # parse_m_file never stores and was never used.
            writer.writerow([layer.get(col, 0) for col in dimension_columns] + [precision])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Parse a .m file and generate a CSV.')
    parser.add_argument('input_file', type=str, help='Path to the input .m file')
    parser.add_argument('precision', type=str, help='Precision value for the CSV')
    parser.add_argument('output_file', type=str, help='Path to the output CSV file')
    args = parser.parse_args()

    layers = parse_m_file(args.input_file)
    write_csv(layers, args.precision, args.output_file)
# value expressed in nJ/op
# Indexed as energy_data[datatype][operation][technology][precision].
energy_data = {
    'FLOAT': {
        'MULT': {
            '7nm': {'FP32': 1.31e-3, 'FP16': 0.34e-3},
            '45nm': {'FP32': 3.7e-3, 'FP16': 1.1e-3, 'FP8': 0.327027027e-3, 'FP4': 0.097224251e-3},
            '22nm': {'FP32': 2.368e-3, 'FP16': 0.704e-3, 'FP8': 0.209297297e-3, 'FP4': 0.062223521e-3}
        },
        'ADD': {
            '7nm': {'FP32': 0.38e-3, 'FP16': 0.16e-3},
            '45nm': {'FP32': 0.9e-3, 'FP16': 0.4e-3, 'FP8': 0.2e-3, 'FP4': 0.1e-3},
            '22nm': {'FP32': 0.576e-3, 'FP16': 0.256e-3, 'FP8': 0.128e-3, 'FP4': 0.064e-3}
        },
        'MAC': {
            '7nm': {},
            '45nm': {'FP32': 16.8e-3, 'FP16': 7.85e-3, 'FP8': 3.802027027e-3, 'FP4': 2.183474251e-3},
            '22nm': {'FP32': 10.752e-3, 'FP16': 5.024e-3, 'FP8': 2.433297297e-3, 'FP4': 1.397423521e-3}
        }
    },
    'INT': {
        'MULT': {
            '7nm': {'INT32': 1.48e-3, 'INT8': 0.07e-3},
            '45nm': {'INT32': 3.1e-3, 'INT16': 0.8e-3, 'INT8': 0.2e-3, 'INT4': 0.05e-3, 'INT2': 0.003225806e-3},
            '22nm': {'INT32': 1.984e-3, 'INT8': 0.128e-3}
        },
        'ADD': {
            '7nm': {'INT32': 0.03e-3, 'INT8': 0.007e-3},
            '45nm': {'INT32': 0.1e-3, 'INT16': 0.06e-3, 'INT8': 0.03e-3, 'INT4': 0.015e-3, 'INT2': 0.0075e-3},
            # NOTE(review): this key is '22nm (ext)', not '22nm', so
            # get_energy('ADD', '22nm', <INT precision>) returns None -- confirm
            # whether the suffix is intentional.
            '22nm (ext)': {'INT32': 0.064e-3, 'INT16': 0.0384e-3, 'INT8': 0.0192e-3, 'INT4': 0.0096e-3, 'INT2': 0.0048e-3}
        },
        'MAC': {
            '7nm': {},
            '45nm': {'INT32': 3.470506757e-3, 'INT16': 2.030506757e-3, 'INT8': 1.290506757e-3, 'INT4': 0.950506757e-3,
                     'INT2': 0.66211966e-3},
            '22nm': {'INT32': 2.221124324e-3, 'INT16': 1.299524324e-3, 'INT8': 0.825924324e-3, 'INT4': 0.608324324e-3,
                     'INT2': 0.423756582e-3}
        }
    }
}

# Read and Write values expressed in nJ
# NOTE(review): the magnitudes (1e-13 .. 1e-8) look like joules rather than
# nanojoules -- confirm the unit against the data source.
# Keyed by SRAM size; each entry holds 'Write', 'Read' and 'Leakage Power (mW)'.
sram_data = {
    512: {'Write': 2.73e-13, 'Read': 2.91e-13, 'Leakage Power (mW)': 5.81e-05},
    1024: {'Write': 3.35e-13, 'Read': 4.18e-13, 'Leakage Power (mW)': 0.000130558},
    2048: {'Write': 5.06e-13, 'Read': 5.89e-13, 'Leakage Power (mW)': 0.000300498},
    4096: {'Write': 9.51e-13, 'Read': 9.73e-13, 'Leakage Power (mW)': 0.000518707},
    8192: {'Write': 1.51e-12, 'Read': 1.53e-12, 'Leakage Power (mW)': 0.00101401},
    16384: {'Write': 2.17e-12, 'Read': 2.24e-12, 'Leakage Power (mW)': 0.00210329},
    32768: {'Write': 3.39e-12, 'Read': 3.45e-12, 'Leakage Power (mW)': 0.00400746},
    65536: {'Write': 5.19e-12, 'Read': 5.30e-12, 'Leakage Power (mW)': 0.00865728},
    131072: {'Write': 7.94e-12, 'Read': 8.06e-12, 'Leakage Power (mW)': 0.0169947},
    262144: {'Write': 1.08e-11, 'Read': 1.09e-11, 'Leakage Power (mW)': 0.0326368},
    524288: {'Write': 1.62e-11, 'Read': 1.63e-11, 'Leakage Power (mW)': 0.0642241},
    1048576: {'Write': 2.22e-11, 'Read': 2.23e-11, 'Leakage Power (mW)': 0.125151},
    2097152: {'Write': 3.33e-11, 'Read': 3.34e-11, 'Leakage Power (mW)': 0.247646},
    4194304: {'Write': 4.59e-11, 'Read': 4.60e-11, 'Leakage Power (mW)': 0.487825},
    8388608: {'Write': 9.18e-11, 'Read': 9.2e-11, 'Leakage Power (mW)': 0.97565},
    16777216: {'Write': 1.836e-10, 'Read': 1.84e-10, 'Leakage Power (mW)': 1.9513},
    33554432: {'Write': 3.672e-10, 'Read': 3.68e-10, 'Leakage Power (mW)': 3.9026},
    67108864: {'Write': 7.344e-10, 'Read': 7.36e-10, 'Leakage Power (mW)': 7.8052},
    134217728: {'Write': 1.4688e-9, 'Read': 1.472e-9, 'Leakage Power (mW)': 15.6104},
    268435456: {'Write': 2.937e-9, 'Read': 2.944e-9, 'Leakage Power (mW)': 31.2208},
    536870912: {'Write': 5.8752e-9, 'Read': 5.88e-9, 'Leakage Power (mW)': 62.4416},
    1073741824: {'Write': 1.175e-8, 'Read': 1.176e-8, 'Leakage Power (mW)': 124.8832},
    2147483648: {'Write': 2.35e-8, 'Read': 2.352e-8, 'Leakage Power (mW)': 249.7664}
}

noc_dyn_energy_per_bit = 0.1143e-12  # J/bit/hop

# Bit width of each precision name.
precision_to_bits = {
    'FP32': 32,
    'FP16': 16,
    'FP8': 8,
    'FP4': 4,
    'INT32': 32,
    'INT16': 16,
    'INT8': 8,
    'INT4': 4,
    'INT2': 2
}


def get_sram_data(size, parameter):
    """Return ``sram_data[size][parameter]``, or ``None`` if either key is absent.

    Args:
        size: SRAM size key; only the exact sizes listed in ``sram_data`` match.
        parameter: ``'Write'``, ``'Read'`` or ``'Leakage Power (mW)'``.
    """
    try:
        return sram_data[size][parameter]
    except KeyError:
        return None


def get_energy(operation, tech='22nm', precision=None):
    """Return the per-operation energy from ``energy_data``.

    Args:
        operation: ``'MULT'``, ``'ADD'`` or ``'MAC'``.
        tech: technology-node key such as ``'22nm'``.
        precision: precision name; names starting with ``'FP'`` are looked up
            in the ``'FLOAT'`` table, everything else in the ``'INT'`` table.

    Returns:
        The table value, or ``None`` when the combination is not tabulated or
        ``precision`` is not a string (including the default ``None``).
    """
    # The default precision=None used to raise AttributeError on .startswith,
    # which the KeyError handler below did not catch.
    if not isinstance(precision, str):
        return None
    datatype = 'FLOAT' if precision.startswith('FP') else 'INT'
    try:
        return energy_data[datatype][operation][tech][precision]
    except KeyError:
        return None


def calculate_noc_dyn_energy(precision, bw, hops=2):
    """Return ``noc_dyn_energy_per_bit * bits * bw * hops`` converted to nJ.

    Args:
        precision: precision name; must be a key of ``precision_to_bits``.
        bw: value converted with ``float()`` and used as a multiplier.
        hops: number of hops to multiply by.

    Raises:
        ValueError: if ``precision`` is not in ``precision_to_bits``.
    """
    if precision not in precision_to_bits:
        raise ValueError(f"Unknown precision: {precision}")
    bits = precision_to_bits[precision]
    # return a value expressed in nJ as for the other values considered
    return noc_dyn_energy_per_bit * bits * float(bw) * hops * 1e9
a/src/GAMMA/gamma.py +++ b/src/GAMMA/gamma.py @@ -10,18 +10,25 @@ from functools import reduce from collections import defaultdict from math import ceil -m_type_dicts = {0:"CONV", 1:"CONV", 2:"DSCONV", 3:"CONV"} -CONVtype_dicts = {0:"FC", 1:"CONV",2:"DSCONV", 3:"GEMM"} -MAC_AREA_MAESTRO=4470 -MAC_AREA_INT8=282 -DEVELOP_MODE=False + +m_type_dicts = {0: "CONV", 1: "CONV", 2: "DSCONV", 3: "CONV"} +CONVtype_dicts = {0: "FC", 1: "CONV", 2: "DSCONV", 3: "GEMM"} +MAC_AREA_MAESTRO = 4470 +MAC_AREA_INT8 = 282 +DEVELOP_MODE = False + class GAMMA(object): - def __init__(self,dimension, map_cstr=None, num_pe=64, pe_limit=1024, fitness="latency", constraints=dict(), par_RS=False, l1_size=512, l2_size=108000, NocBW=81920000, offchipBW=81920000, slevel_min=2,slevel_max=2, fixedCluster=0, log_level=2,constraint_class=None,external_mem_cstr=None, use_factor=False,uni_base=True): - super(GAMMA,self).__init__() + def __init__(self, dimension, map_cstr=None, num_pe=64, pe_limit=1024, fitness="latency", constraints=dict(), + par_RS=False, l1_size=512, l2_size=108000, NocBW=81920000, offchipBW=81920000, slevel_min=2, + slevel_max=2, fixedCluster=0, log_level=2, constraint_class=None, external_mem_cstr=None, + use_factor=False, uni_base=True): + super(GAMMA, self).__init__() self.dimension = dimension - self.dimension_dict = {"K":dimension[0], "C":dimension[1], "Y":dimension[2], "X":dimension[3], "R":dimension[4],"S":dimension[5], "T":dimension[6]} - self.lastcluster_dict = {"K":dimension[0], "C":dimension[1], "Y":dimension[2], "X":dimension[3], "R":dimension[4],"S":dimension[5], "T":dimension[6]} + self.dimension_dict = {"K": dimension[0], "C": dimension[1], "Y": dimension[2], "X": dimension[3], + "R": dimension[4], "S": dimension[5], "T": dimension[6]} + self.lastcluster_dict = {"K": dimension[0], "C": dimension[1], "Y": dimension[2], "X": dimension[3], + "R": dimension[4], "S": dimension[5], "T": dimension[6]} if DEVELOP_MODE: path = 
"/usr/scratch/felix/my_code/HW_optimizer_rnn_result_history/maestro_his/maestro21/" if os.path.exists(path) is False: @@ -37,11 +44,11 @@ def __init__(self,dimension, map_cstr=None, num_pe=64, pe_limit=1024, fitness="l self.num_pe = num_pe self.pe_limit = pe_limit self.fitness_objective = fitness - self.cluster_space = ["K", "C", "Y","X","R","S"] if par_RS else ["K", "C", "Y","X"] - self.l1_size = l1_size if l1_size > 0 else 2**30 - self.l2_size = l2_size if l2_size > 0 else 2**30 - self.NocBW = NocBW if NocBW>0 else 2**30 - self.offchipBW = offchipBW if offchipBW > 0 else 2**30 + self.cluster_space = ["K", "C", "Y", "X", "R", "S"] if par_RS else ["K", "C", "Y", "X"] + self.l1_size = l1_size if l1_size > 0 else 2 ** 30 + self.l2_size = l2_size if l2_size > 0 else 2 ** 30 + self.NocBW = NocBW if NocBW > 0 else 2 ** 30 + self.offchipBW = offchipBW if offchipBW > 0 else 2 ** 30 self.slevel_min = slevel_min self.slevel_max = slevel_max self.fixedCluster = fixedCluster @@ -53,26 +60,29 @@ def __init__(self,dimension, map_cstr=None, num_pe=64, pe_limit=1024, fitness="l self.stat = None self.dimension_factors = self.get_dimension_factors(self.dimension_dict) self.use_ranking = True if self.fitness_objective[0] == "ranking" else False - self.constraints=constraints - self.constraint_class=constraint_class + self.constraints = constraints + self.constraint_class = constraint_class self.external_mem_cstr = external_mem_cstr self.use_factor = use_factor self.uni_base = uni_base self.L1_bias_template = None - self.area_pebuf_only=False + self.area_pebuf_only = False self.external_area_model = False + self.precision = None - def reset_hw_parm(self, l1_size=None, l2_size=None, num_pe=None, NocBW=None, map_cstr=None, pe_limit=None,area_pebuf_only=None, external_area_model=None, offchipBW=None, slevel_max=None, slevel_min=None): + def reset_hw_parm(self, l1_size=None, l2_size=None, num_pe=None, NocBW=None, map_cstr=None, pe_limit=None, + area_pebuf_only=None, 
external_area_model=None, offchipBW=None, slevel_max=None, slevel_min=None, + precision=None): if l1_size: - self.l1_size=l1_size if l1_size > 0 else 2**30 + self.l1_size = l1_size if l1_size > 0 else 2 ** 30 if l2_size: - self.l2_size=l2_size if l2_size > 0 else 2**30 + self.l2_size = l2_size if l2_size > 0 else 2 ** 30 if num_pe: - self.num_pe=num_pe + self.num_pe = num_pe if NocBW: - self.NocBW=NocBW if NocBW > 0 else 2**30 + self.NocBW = NocBW if NocBW > 0 else 2 ** 30 if offchipBW: - self.offchipBW=offchipBW if offchipBW > 0 else 2**30 + self.offchipBW = offchipBW if offchipBW > 0 else 2 ** 30 if map_cstr: self.map_cstr = map_cstr if pe_limit: @@ -87,20 +97,23 @@ def reset_hw_parm(self, l1_size=None, l2_size=None, num_pe=None, NocBW=None, map self.slevel_min = slevel_min if slevel_max: self.slevel_max = slevel_max + if precision: + self.precision = precision def get_dimension_factors(self, dimension_dict): dimension_factors = dict() for key, value in dimension_dict.items(): if key != "T": factors = self.get_factors(value) - dimension_factors[key] = {"set":factors, "array":np.array(list(factors))} + dimension_factors[key] = {"set": factors, "array": np.array(list(factors))} return dimension_factors - def reset_dimension(self, dimension=None, fitness=None, constraints=None, constraint_class=None, external_mem_cstr=None): + def reset_dimension(self, dimension=None, fitness=None, constraints=None, constraint_class=None, + external_mem_cstr=None): if dimension is not None: self.dimension = dimension if fitness is not None: - self.fitness_objective = fitness + self.fitness_objective = fitness if constraints is not None: self.constraints = constraints if constraint_class is not None: @@ -108,7 +121,9 @@ def reset_dimension(self, dimension=None, fitness=None, constraints=None, constr if external_mem_cstr is not None: self.external_mem_cstr = external_mem_cstr self.use_ranking = True if self.fitness_objective[0] == "ranking" else False - self.dimension_dict = {"K": 
self.dimension[0], "C": self.dimension[1], "Y": self.dimension[2], "X": self.dimension[3], "R": self.dimension[4],"S": self.dimension[5], "T": self.dimension[6]} + self.dimension_dict = {"K": self.dimension[0], "C": self.dimension[1], "Y": self.dimension[2], + "X": self.dimension[3], "R": self.dimension[4], "S": self.dimension[5], + "T": self.dimension[6]} self.dimension_factors = self.get_dimension_factors(self.dimension_dict) def create_genome_with_cstr(self): @@ -118,14 +133,24 @@ def create_genome_with_cstr(self): self.map_cstr.create_from_constraint(indv, self.fixedCluster, self.dimension_dict) return indv - def create_genome(self, uni_base=False,last_cluster_dict=None, l1_bias_template=None): + def create_genome_fixedSL(self, bias=None): + if self.map_cstr: + return self.create_genome_with_cstr() + ind = self.create_genome() + for _ in range(self.slevel_min - 1): + ind = self.born_cluster_ind(ind) + if bias: + ind = self.biased_init(ind, bias=bias) + return ind + + def create_genome(self, uni_base=False, last_cluster_dict=None, l1_bias_template=None, ind_level=0): if uni_base: if l1_bias_template: K, C, Y, X, R, S = l1_bias_template else: - K,C,Y,X,R,S,T = [1]*len(self.dimension) + K, C, Y, X, R, S, T = [1] * len(self.dimension) else: - K,C,Y,X,R,S,T = self.dimension + K, C, Y, X, R, S, T = self.dimension if uni_base is False and last_cluster_dict: K = last_cluster_dict["K"] C = last_cluster_dict["C"] @@ -135,52 +160,95 @@ def create_genome(self, uni_base=False,last_cluster_dict=None, l1_bias_template= S = last_cluster_dict["S"] sp = random.choice(self.cluster_space) lastcluster_sz = last_cluster_dict[sp] if last_cluster_dict else self.dimension_dict[sp] - if uni_base == True: - if self.fixedCluster>0: - sp_sz = self.fixedCluster + if self.slevel_max == 3: + if uni_base: + sp_sz = self.sp_sz_precision_based_3_level() + limited_cluster_space = self.cluster_space[1:4] + sp = random.choice(limited_cluster_space) else: - if self.num_pe > 0: - sp_sz = 
random.randint(1, min(lastcluster_sz, self.num_pe)) - else: - sp_sz = random.randint(1, lastcluster_sz) + if ind_level == 0: + sp_sz = random.randint(1, self.num_pe if self.num_pe > 0 else self.pe_limit) + if ind_level == 1: + if self.fixedCluster > 0: + sp_sz = self.fixedCluster + else: + sp_sz = np.random.randint(2, self.num_pe // self.sp_sz_precision_based_3_level()) else: - sp_sz = random.randint(1, self.num_pe if self.num_pe > 0 else self.pe_limit) + if uni_base == True: + if self.fixedCluster > 0: + sp_sz = self.fixedCluster + else: + if self.num_pe > 0: + sp_sz = random.randint(1, min(lastcluster_sz, self.num_pe)) + else: + sp_sz = random.randint(1, lastcluster_sz) + else: + sp_sz = random.randint(1, self.num_pe if self.num_pe > 0 else self.pe_limit) if self.use_factor and not uni_base: - df = [["K", np.random.choice(self.dimension_factors["K"]["array"])], ["C",np.random.choice(self.dimension_factors["C"]["array"])], ["Y", np.random.choice(self.dimension_factors["Y"]["array"])], - ["X", np.random.choice(self.dimension_factors["X"]["array"])], ["R",np.random.choice(self.dimension_factors["R"]["array"])], ["S",np.random.choice(self.dimension_factors["S"]["array"])]] + df = [["K", np.random.choice(self.dimension_factors["K"]["array"])], + ["C", np.random.choice(self.dimension_factors["C"]["array"])], + ["Y", np.random.choice(self.dimension_factors["Y"]["array"])], + ["X", np.random.choice(self.dimension_factors["X"]["array"])], + ["R", np.random.choice(self.dimension_factors["R"]["array"])], + ["S", np.random.choice(self.dimension_factors["S"]["array"])]] else: if uni_base: - df = [["K", K], ["C", C], ["Y", Y],["X", X], ["R", R], ["S", S]] + df = [["K", K], ["C", C], ["Y", Y], ["X", X], ["R", R], ["S", S]] else: - df = [["K", random.randint(1, K)], ["C", random.randint(1, C)], ["Y", random.randint(1, Y)],["X", random.randint(1, X)], ["R", random.randint(1, R)], ["S", random.randint(1, S)]] + df = [["K", random.randint(1, K)], ["C", random.randint(1, C)], ["Y", 
def sp_sz_precision_based_3_level(self):
    """Return the precision-dependent size used by the 3-level genome path.

    Reads ``self.precision``.  ``None`` and the 32-bit precisions give 0,
    16-bit gives 2, 8-bit gives 4, 4-bit gives 8 and 2-bit gives 16.

    Raises:
        ValueError: if ``self.precision`` is not a recognised name.  The
            original if-chain fell through and returned ``None`` in that case.
    """
    # NOTE(review): create_genome divides num_pe by this value, so the 0
    # returned for None/"FP32"/"INT32" raises ZeroDivisionError on that path --
    # confirm whether 32-bit precisions can reach it with slevel_max == 3.
    size_by_precision = {
        None: 0,
        "FP32": 0, "INT32": 0,
        "FP16": 2, "INT16": 2,
        "FP8": 4, "INT8": 4,
        "FP4": 8, "INT4": 8,
        "FP2": 16, "INT2": 16,
    }
    try:
        return size_by_precision[self.precision]
    except (KeyError, TypeError):
        raise ValueError(f"Unsupported precision: {self.precision!r}") from None


def search_loc(self, segment_of_indv, dim):
    """Return the index of the first gene whose name equals ``dim``.

    Args:
        segment_of_indv: sequence of ``[name, value]`` genes.
        dim: gene name to look for.

    Returns:
        The index of the first match, or ``None`` when nothing matches.
    """
    for i, gene in enumerate(segment_of_indv):
        if gene[0] == dim:
            return i
    return None


def validTo_external_mem_cstr(self, indv, num_pe=1024):
    """Check an individual against ``self.external_mem_cstr``.

    Args:
        indv: individual forwarded to ``self.compute_l1_l2_mem_size``.
        num_pe: PE count forwarded to ``self.compute_l1_l2_mem_size``.

    Returns:
        ``True`` when no external constraint is set, or when every constrained
        entry of the computed memory usage is less than or equal to its limit;
        ``False`` otherwise.
    """
    if not self.external_mem_cstr:
        return True
    mem_used = self.compute_l1_l2_mem_size(indv, num_pe=num_pe)
    return all(mem_used[key] <= limit
               for key, limit in self.external_mem_cstr.items())
output = get_w_i_o_size(picks=self.scan_indv(indv[0:7]), level=2,num_pe=num_pe) + + weight, input, output = get_w_i_o_size(picks=self.scan_indv(indv[0:7]), level=2, num_pe=num_pe) mem[f"L2-W"] = weight mem[f"L2-I"] = input mem[f"L2-O"] = output @@ -200,16 +269,16 @@ def get_w_i_o_size(picks, level=1, num_pe=1024): mem[f"L1-soft"] = output + input + weight return mem - def biased_init(self, indv, bias = None): + def biased_init(self, indv, bias=None): if bias is None: return indv if "par" in bias: for key, value in bias["par"].items(): - pointer = (key-1) * 7 + pointer = (key - 1) * 7 indv[pointer][0] = value if "order" in bias: for key, value in bias["order"].items(): - st, end = (key-1)*7+1, (key)*7 + st, end = (key - 1) * 7 + 1, (key) * 7 temp_indv = copy.deepcopy(indv[st: end]) for di in value[::-1]: idx = self.search_loc(temp_indv, di) @@ -218,7 +287,7 @@ def biased_init(self, indv, bias = None): indv[st: end] = temp_indv if "tiles" in bias: for key, value in bias["tiles"].items(): - st, end = (key-1)*7+1, (key)*7 + st, end = (key - 1) * 7 + 1, (key) * 7 temp_indv = copy.deepcopy(indv[st: end]) if key == 1: last_cluster_dict = self.dimension_dict @@ -227,27 +296,17 @@ def biased_init(self, indv, bias = None): for i in range(len(temp_indv)): dim = temp_indv[i][0] if dim in value: - new_tile = max(1, int(last_cluster_dict[dim]* value[dim])) + new_tile = max(1, int(last_cluster_dict[dim] * value[dim])) temp_indv[i][1] = new_tile indv[st: end] = temp_indv return indv - def create_genome_fixedSL(self, bias = None): - if self.map_cstr: - return self.create_genome_with_cstr() - ind = self.create_genome() - for _ in range(self.slevel_min-1): - ind = self.born_cluster_ind(ind) - if bias: - ind = self.biased_init(ind, bias=bias) - return ind - def select_parents(self, pop, fitness, num_parents, num_population): #=====sel unique====================== pop_set = set() to_saved_idx = [] for i in range(len(pop)): - cur_cand = tuple([tt for i, t in enumerate(pop[i]) for j, tt 
in enumerate(t) if (i, j) != (0, 1)]) + cur_cand = tuple([tt for i, t in enumerate(pop[i]) for j, tt in enumerate(t) if (i, j) != (0, 1)]) if cur_cand not in pop_set: pop_set.add(cur_cand) to_saved_idx.append(i) @@ -257,10 +316,10 @@ def select_parents(self, pop, fitness, num_parents, num_population): #===================================== if self.normalize: - norm_fitness = fitness/np.abs(np.nanmean(np.ma.masked_equal(fitness, value=float("-Inf")), axis=0)) + norm_fitness = fitness / np.abs(np.nanmean(np.ma.masked_equal(fitness, value=float("-Inf")), axis=0)) fitness_list = [tuple([-np.prod(ar[1:]), -i]) for i, ar in enumerate(norm_fitness)] else: - fitness_list = [tuple(list(ar)+[-i]) for i, ar in enumerate(fitness)] + fitness_list = [tuple(list(ar) + [-i]) for i, ar in enumerate(fitness)] fitness_list = sorted(fitness_list, reverse=True) idx = [int(-ar[-1]) for ar in fitness_list] new_pop = [pop[i] for i in idx][:num_population] @@ -269,20 +328,21 @@ def select_parents(self, pop, fitness, num_parents, num_population): if self.use_pleteau: num_pletau = self.build_pleteau(fitness, pop) # print(f"Num pleteau: {num_pletau}") - fitness_list = [tuple([*ar[:len(self.fitness_objective)], *ar]) for i, ar in enumerate(self.pleteau_sol.keys())] + fitness_list = [tuple([*ar[:len(self.fitness_objective)], *ar]) for i, ar in + enumerate(self.pleteau_sol.keys())] fitness_list = sorted(fitness_list, reverse=True) idx = [tuple(ar[-len(self.fitness_objective):]) for ar in fitness_list] new_pop[num_pletau:] = new_pop[:-num_pletau] - new_pop[:num_pletau] =[self.pleteau_sol[i] for i in idx] + new_pop[:num_pletau] = [self.pleteau_sol[i] for i in idx] new_fitness[num_pletau:] = new_fitness[:-num_pletau] - new_fitness[:num_pletau] =[i for i in idx] - parents = copy.deepcopy(new_pop[:num_parents+num_pletau]) + new_fitness[:num_pletau] = [i for i in idx] + parents = copy.deepcopy(new_pop[:num_parents + num_pletau]) self.best_reward_pleteau = copy.deepcopy(new_fitness[:num_pletau]) 
self.best_sol_pleteau = copy.deepcopy(new_pop[:num_pletau]) return new_pop, new_fitness, parents - def mutate_par(self, pop,alpha=0.5): + def mutate_par(self, pop, alpha=0.5): if self.map_cstr is not None: return for idx in range(len(pop)): @@ -291,19 +351,19 @@ def mutate_par(self, pop,alpha=0.5): # avail_val = self.num_free_par + self.num_free_order - 1 # else: # avail_val = len(indv) - 1 - # ##===ad hoc trial========= + # ##===ad hoc trial========= pop[idx][7][0], pop[idx][0][0] = pop[idx][0][0], pop[idx][7][0] continue - # #========================= + # #========================= pick = random.randint(0, avail_val) - pick_level = pick//7 - pick = int(pick_level *7) - if self.map_cstr and "sp" in self.cstr_list[pick_level]: + pick_level = pick // 7 + pick = int(pick_level * 7) + if self.map_cstr and "sp" in self.cstr_list[pick_level]: choices = self.cstr_list[pick_level]["sp"] else: choices = self.cluster_space sp = random.choice(choices) - if self.map_cstr and "sp_sz" in self.cstr_list[pick_level]: + if self.map_cstr and "sp_sz" in self.cstr_list[pick_level]: sp_sz = self.self.cstr_list[pick_level]["sp_sz"] else: if self.fixedCluster < 1: @@ -325,7 +385,7 @@ def mutate_tile(self, pop, is_finetune=False, num_mu_loc=1, alpha=0.5, range_alp if random.random() < alpha: if self.map_cstr: num_free_tile = self.cstr_list[1]["num_free_tile"] - if num_free_tile==0: + if num_free_tile == 0: pick = random.randint(0, len(indv) - 6 - 1) else: pick = random.randint(0, len(indv) - 1) @@ -334,33 +394,52 @@ def mutate_tile(self, pop, is_finetune=False, num_mu_loc=1, alpha=0.5, range_alp if cluster_only: pick = 7 if pick % 7 == 0: - if self.map_cstr and "sp" in self.cstr_list[pick // 7]: + if self.map_cstr and "sp" in self.cstr_list[pick // 7]: choices = self.cstr_list[pick // 7]["sp"] else: choices = self.cluster_space - sp = random.choice(choices) - if pick>0: - if self.map_cstr and "sp_sz" in self.cstr_list[pick // 7]: - sp_sz = self.cstr_list[pick // 7]["sp_sz"] - else: + if 
self.slevel_max == 3 and pick >= 7: + if pick // 7 == 1: if self.fixedCluster < 1: - last_cluster_dict = self.scan_indv(indv[:-7]) if pick != 0 else None - lastcluster_sz = last_cluster_dict[sp] if last_cluster_dict else self.dimension_dict[sp] - if self.num_pe > 0: - # sp_sz = max(1, random.randint(0, min(lastcluster_sz, self.num_pe))) - sp_sz = max(1, random.choice(list(self.get_factors(min(lastcluster_sz, self.num_pe))))) - else: - # sp_sz = max(1, random.randint(0, min(lastcluster_sz, indv[0][1]))) - sp_sz = max(1, random.choice(list(self.get_factors(min(lastcluster_sz, indv[0][1]))))) + sp_sz = np.random.randint(2, self.num_pe // self.sp_sz_precision_based_3_level()) + sp = random.choice(choices) else: sp_sz = self.fixedCluster + sp = random.choice(choices) + if pick // 7 == 2: + sp_sz = self.sp_sz_precision_based_3_level() + sp = random.choice(choices[1:4]) else: - sp_sz =pop[idx][pick][1] + sp = random.choice(choices) + if pick > 0: + if self.map_cstr and "sp_sz" in self.cstr_list[pick // 7]: + sp_sz = self.cstr_list[pick // 7]["sp_sz"] + else: + if self.fixedCluster < 1: + last_cluster_dict = self.scan_indv(indv[:-7]) if pick != 0 else None + lastcluster_sz = last_cluster_dict[sp] if last_cluster_dict else \ + self.dimension_dict[sp] + if self.num_pe > 0: + # sp_sz = max(1, random.randint(0, min(lastcluster_sz, self.num_pe))) + sp_sz = max(1, random.choice( + list(self.get_factors(min(lastcluster_sz, self.num_pe))))) + else: + # sp_sz = max(1, random.randint(0, min(lastcluster_sz, indv[0][1]))) + sp_sz = max(1, random.choice( + list(self.get_factors(min(lastcluster_sz, indv[0][1]))))) + else: + sp_sz = self.fixedCluster + else: + sp_sz = pop[idx][pick][1] + pop[idx][pick] = [sp, sp_sz] else: d, d_sz = indv[pick] if pick > 7: - last_cluster_dict = self.scan_indv(indv[:-7]) + if 7 <= pick < 14: + last_cluster_dict = self.scan_indv(indv[7:14]) + elif pick >= 14: + last_cluster_dict = self.scan_indv(indv[:-7]) thr = last_cluster_dict[d] if self.use_factor is 
False: new_d_sz = random.randint(1, thr) @@ -381,33 +460,35 @@ def mutate_tile(self, pop, is_finetune=False, num_mu_loc=1, alpha=0.5, range_alp new_d_sz = max(1, min(new_d_sz, self.dimension_dict[d])) pop[idx][pick][1] = new_d_sz + def mutate_pe(self, pop, alpha=0.5, mutate_range_ratio=0.5): for idx in range(len(pop)): - if len(pop[idx])<=7: + if len(pop[idx]) <= 7: if random.random() < alpha: pop[idx][0][1] = random.randint(1, self.pe_limit) else: - sp , sp_sz, *a = pop[idx][7] - cur_multiplier = pop[idx][0][1]//sp_sz - if random.random()< alpha: + sp, sp_sz, *a = pop[idx][7] + cur_multiplier = pop[idx][0][1] // sp_sz + if random.random() < alpha: if self.use_factor is False: #==method 1 last_cluster_dict = self.scan_indv(pop[idx][:7]) last_cluster_dict_sz = last_cluster_dict[sp] max_multiplier = max(1, self.pe_limit // sp_sz) - cur_multiplier = random.randint(1, min(max_multiplier, ceil(self.dimension_dict[sp]/last_cluster_dict_sz))) + cur_multiplier = random.randint(1, min(max_multiplier, + ceil(self.dimension_dict[sp] / last_cluster_dict_sz))) # ====constrained to smaller search space==== - max_value = min(max_multiplier, ceil(self.dimension_dict[sp]/last_cluster_dict_sz)) - cur_multiplier = random.randint(max(1, int(max_value*mutate_range_ratio)), max_value) + max_value = min(max_multiplier, ceil(self.dimension_dict[sp] / last_cluster_dict_sz)) + cur_multiplier = random.randint(max(1, int(max_value * mutate_range_ratio)), max_value) #============================================ else: #method 2 factors = self.dimension_factors[sp]["array"] max_multiplier = max(1, self.pe_limit // sp_sz) - factors = factors[(factors<= max_multiplier)] + factors = factors[(factors <= max_multiplier)] cur_multiplier = random.choice(factors) # ====constrained to smaller search space==== - cur_multiplier = random.choice(factors[int(len(factors)*mutate_range_ratio):]) + cur_multiplier = random.choice(factors[int(len(factors) * mutate_range_ratio):]) 
#============================================ cur_pe = min(self.pe_limit, cur_multiplier * sp_sz) pop[idx][0][1] = cur_pe @@ -419,23 +500,23 @@ def swap_order(self, pop, alpha=0.5): return while max_count > 0: max_count -= 1 - if random.random()< alpha: + if random.random() < alpha: idx = random.randint(0, len(pop) - 1) if self.map_cstr is None: - sel_cluster = random.randint(0, (len(pop[idx])-1)//7) - swap_id = np.random.randint(1, 6+1, (2,)) + sel_cluster * 7 + sel_cluster = random.randint(0, (len(pop[idx]) - 1) // 7) + swap_id = np.random.randint(1, 6 + 1, (2,)) + sel_cluster * 7 else: - sel_cluster = random.randint(0, (self.num_free_order-1)//6) - num_free_order = (self.num_free_order - sel_cluster*6 -1)%6 - swap_id = np.random.randint(1, 1+num_free_order+1, (2,)) + sel_cluster * 7 + sel_cluster = random.randint(0, (self.num_free_order - 1) // 6) + num_free_order = (self.num_free_order - sel_cluster * 6 - 1) % 6 + swap_id = np.random.randint(1, 1 + num_free_order + 1, (2,)) + sel_cluster * 7 pop[idx][swap_id[0]], pop[idx][swap_id[1]] = pop[idx][swap_id[1]], pop[idx][swap_id[0]] def crossover_tile(self, parents, pop, alpha=0.5): - if len(parents) ==1: + if len(parents) == 1: for idx in range(len(pop)): pop[idx] = copy.deepcopy(parents[0]) else: - for idx in range(0,len(pop),2): + for idx in range(0, len(pop), 2): pick_range = np.random.permutation(np.arange(0, len(parents))) dad, mom = parents[pick_range[0]], parents[pick_range[1]] # dad, mom = parents[random.randint(0, len(parents)-1)], parents[random.randint(0, len(parents)-1)] @@ -445,9 +526,9 @@ def crossover_tile(self, parents, pop, alpha=0.5): if random.random() < alpha: cross_point = random.choice(["K", "C", "Y", "X", "R", "S"]) for k in range(0, length, 7): - for i in range(k+1, k+7): + for i in range(k + 1, k + 7): d, d_sz = dad[i] - if d== cross_point: + if d == cross_point: dad_sz = d_sz dad_idx = i d, d_sz = mom[i] @@ -458,12 +539,12 @@ def crossover_tile(self, parents, pop, alpha=0.5): 
mom[mom_idx][1] = dad_sz pop[idx] = dad if idx + 1 < len(pop): - pop[idx+1] = mom + pop[idx + 1] = mom - def check_tile_dependency(self, pop): + def check_tile_dependency(self, pop): for idx in range(0, len(pop)): cur_pop = pop[idx] - last_cluster =self.scan_indv(cur_pop) + last_cluster = self.scan_indv(cur_pop) first_cluster = self.scan_indv(cur_pop[:7]) for key in ["K", "C", "Y", "X", "R", "S"]: if last_cluster[key] > first_cluster[key]: @@ -473,14 +554,14 @@ def correctify_tile_dependency(self, pop): for i in range(0, len(pop)): ind = pop[i] cur_cluster = None - levels = len(ind)//7 + levels = len(ind) // 7 for i in range(levels): last_cluster = copy.deepcopy(cur_cluster) - cur_cluster = self.scan_indv(ind[7*i:7*(i+1)]) + cur_cluster = self.scan_indv(ind[7 * i:7 * (i + 1)]) if i == 0: continue else: - for idx in range(7*i+1, 7*(i+1)): + for idx in range(7 * i + 1, 7 * (i + 1)): d, d_sz = ind[idx] d_sz = min(last_cluster[d], d_sz) ind[idx][1] = d_sz @@ -488,26 +569,38 @@ def correctify_tile_dependency(self, pop): def correctify_tile_dependency_thread(self, indv): ind = copy.deepcopy(indv) cur_cluster = None - levels = len(ind)//7 + levels = len(ind) // 7 for i in range(levels): last_cluster = copy.deepcopy(cur_cluster) - cur_cluster = self.scan_indv(ind[7*i:7*(i+1)]) + cur_cluster = self.scan_indv(ind[7 * i:7 * (i + 1)]) if i == 0: continue else: - for idx in range(7*i+1, 7*(i+1)): + for idx in range(7 * i + 1, 7 * (i + 1)): d, d_sz = ind[idx] d_sz = min(last_cluster[d], d_sz) ind[idx][1] = d_sz return ind def born_cluster_ind(self, ind): - if (len(ind)) // 7 < self.slevel_max: + ind_level = len(ind) // 7 + if ind_level < self.slevel_max: last_cluster_dict = self.scan_indv(ind) - new_ind = ind + self.create_genome(uni_base=self.uni_base, l1_bias_template=self.L1_bias_template, last_cluster_dict=last_cluster_dict) + if self.slevel_max == 3: + if ind_level == 1: + new_ind = ind + self.create_genome(l1_bias_template=self.L1_bias_template, + 
last_cluster_dict=last_cluster_dict, ind_level=1) + else: + new_ind = ind + self.create_genome(uni_base=self.uni_base, l1_bias_template=self.L1_bias_template, + last_cluster_dict=last_cluster_dict) + else: + new_ind = ind + self.create_genome(uni_base=self.uni_base, l1_bias_template=self.L1_bias_template, + last_cluster_dict=last_cluster_dict) + ind = new_ind return ind + def born_cluster(self, pop, alpha=0.1): max_count = len(pop) while max_count > 0: @@ -523,15 +616,15 @@ def kill_cluster(self, pop, alpha=0.5): max_count -= 1 if random.random() < alpha: idx = random.randint(0, len(pop) - 1) - if (len(pop[idx]))//7>self.slevel_min: + if (len(pop[idx])) // 7 > self.slevel_min: pop[idx] = pop[idx][:-7] - def scan_indv(self,indv): - last_cluster_dict=defaultdict(str) - for i in range(len(indv)-6,len(indv), 1): + def scan_indv(self, indv): + last_cluster_dict = defaultdict(str) + for i in range(len(indv) - 6, len(indv), 1): d, d_sz = indv[i] last_cluster_dict[d] = d_sz - return last_cluster_dict + return last_cluster_dict def get_out_repr(self, x): if x in self.out_repr: @@ -556,12 +649,13 @@ def create_unit_base_pops(self, population, num_all_unit=None): for i in range(1, 7): population[idx][i + level * 7][1] = 1 - def reinit_pop(self,pool, num_population, stage_idx, best_sol_1st, init_pop, cur_gen=-1, bias= None, num_all_unit=2, precision=None): + def reinit_pop(self, pool, num_population, stage_idx, best_sol_1st, init_pop, cur_gen=-1, bias=None, num_all_unit=2, + precision=None): population = [self.create_genome_fixedSL(bias=bias) for _ in range(num_population)] - #====always create a base unit pop======= + # ====always create a base unit pop======= self.create_unit_base_pops(population, num_all_unit=num_all_unit) - #======================================== + # ======================================== if init_pop is not None: # population = [self.create_genome_fixedSL() for _ in range(num_population)] if best_sol_1st is None else [best_sol_1st for _ in 
range(num_population)] population[:10] = init_pop[:10] @@ -574,17 +668,15 @@ def reinit_pop(self,pool, num_population, stage_idx, best_sol_1st, init_pop, cu self.comform_to_cstr(population) self.fitness = np.ones((max(num_population, len(population)), len(self.fitness_objective)), float) - self.evaluate(pool=pool, population=population,cur_gen=cur_gen, precision=precision) + self.evaluate(pool=pool, population=population, cur_gen=cur_gen, precision=precision) return population - - def cal_statstics(self): fitness = np.array(self.fitness) - reward = fitness[:,0] - sel_valid = reward>float("-Inf") - latency_ave = np.mean(-fitness[sel_valid, 0]) - area_ave = np.mean(-fitness[sel_valid, 1]) + reward = fitness[:, 0] + sel_valid = reward > float("-Inf") + latency_ave = np.mean(-fitness[sel_valid, 0]) + area_ave = np.mean(-fitness[sel_valid, 1]) l1_size_ave = np.mean(-fitness[sel_valid, 2]) l2_size_ave = np.mean(-fitness[sel_valid, 3]) # l1_size_pops = -np.array(self.l1_size_pop) @@ -592,10 +684,10 @@ def cal_statstics(self): # l1_size_pops = -l1_size_pops[l1_size_pops>float("-Inf")] # l2_size_pops = -l2_size_pops[l2_size_pops>float("-Inf")] statstics = { - "latency_ave":latency_ave, - "area_ave":area_ave, - "l1_size_ave":l1_size_ave, - "l2_size_ave":l2_size_ave + "latency_ave": latency_ave, + "area_ave": area_ave, + "l1_size_ave": l1_size_ave, + "l2_size_ave": l2_size_ave } self.stat = statstics return statstics @@ -604,7 +696,7 @@ def cal_pletau_stat(self): fitness = np.array(list(self.pleteau_sol.keys())) fitness = np.mean(fitness, axis=0) stats = { - "fitness":fitness, + "fitness": fitness, "Reward": fitness[0], "latency": fitness[1], "area": fitness[2], @@ -613,7 +705,7 @@ def cal_pletau_stat(self): } return stats - def build_pleteau(self,fitness, population): + def build_pleteau(self, fitness, population): self.pleteau_sol = dict() for cand_fit, cand_sol in zip(fitness, population): self.insert_into_pleteau(cand_fit, cand_sol) @@ -621,14 +713,14 @@ def 
build_pleteau(self,fitness, population): def insert_into_pleteau(self, cand_fit, cand_sol): reject = False - if np.prod(cand_fit>float("-inf"))!=1: + if np.prod(cand_fit > float("-inf")) != 1: return cand_fit = tuple(list(cand_fit)) for pl in set(self.pleteau_sol.keys()): - if all([cand_fit[i]< pl[i] for i in range(len(cand_fit))]): + if all([cand_fit[i] < pl[i] for i in range(len(cand_fit))]): del self.pleteau_sol[pl] self.pleteau_sol[cand_fit] = cand_sol - elif all([cand_fit[i]> pl[i] for i in range(len(cand_fit))]): + elif all([cand_fit[i] > pl[i] for i in range(len(cand_fit))]): reject = True if not reject: self.pleteau_sol[cand_fit] = cand_sol @@ -645,8 +737,8 @@ def adjust_fitness(self, fitness): rank2 = np.zeros((len(idx),)) rank2[idx] = -np.arange(len(idx)) rank = rank1 + rank2 - fitness[:,0] = rank - gen_best_idx = np.argmax(fitness[:,0]) + fitness[:, 0] = rank + gen_best_idx = np.argmax(fitness[:, 0]) return fitness, gen_best_idx def evaluate(self, pool, population, cur_gen=-1, precision=None): @@ -675,7 +767,7 @@ def evaluate(self, pool, population, cur_gen=-1, precision=None): gen_best_activity = activity_count gen_best_idx = i if self.use_ranking: - self.fitness, gen_best_idx = self.adjust_fitness(self.fitness) + self.fitness, gen_best_idx = self.adjust_fitness(self.fitness) gen_best = - np.prod(self.fitness[gen_best_idx][1:]) judging_best_reward = - np.prod(self.best_reward[1:]) else: @@ -701,13 +793,13 @@ def evaluate(self, pool, population, cur_gen=-1, precision=None): "num_generations": self.num_generations, "fitness_use": self.fitness_objective, "num_pe": self.num_pe, - "pe_limit":self.pe_limit, + "pe_limit": self.pe_limit, "l1_size": self.l1_size, "l2_size": self.l2_size, "NocBW": self.NocBW, "dimension": self.dimension, - "best_reward_pleteau":self.best_reward_pleteau , - "best_sol_pleteau":self.best_sol_pleteau , + "best_reward_pleteau": self.best_reward_pleteau, + "best_sol_pleteau": self.best_sol_pleteau, # "stat":stat, # 
"stat_list":self.stat_list } @@ -731,20 +823,37 @@ def injection(self, inject_ratio=1.0): return pop_inj, inj_fitness def run(self, dimension, stage_idx=0, prev_stage_value=0, num_population=100, num_generations=100, elite_ratio=0.05, - parents_ratio=0.4, ratio_decay=1, num_finetune=1, best_sol_1st=None, init_pop=None, bias=None, uni_base=True, use_factor=False, use_pleteau=False, L1_bias_template=None, precision=None): - self.init_arguement(dimension=dimension, stage_idx=stage_idx, prev_stage_value=prev_stage_value, num_population=num_population, num_generations=num_generations, elite_ratio=elite_ratio, - parents_ratio=parents_ratio, ratio_decay=ratio_decay, num_finetune=num_finetune, best_sol_1st=best_sol_1st, init_pop=init_pop,uni_base=uni_base, use_factor=use_factor, use_pleteau=use_pleteau,L1_bias_template=L1_bias_template) + parents_ratio=0.4, ratio_decay=1, num_finetune=1, best_sol_1st=None, init_pop=None, bias=None, + uni_base=True, use_factor=False, use_pleteau=False, L1_bias_template=None): + self.init_arguement(dimension=dimension, stage_idx=stage_idx, prev_stage_value=prev_stage_value, + num_population=num_population, num_generations=num_generations, elite_ratio=elite_ratio, + parents_ratio=parents_ratio, ratio_decay=ratio_decay, num_finetune=num_finetune, + best_sol_1st=best_sol_1st, init_pop=init_pop, uni_base=uni_base, use_factor=use_factor, + use_pleteau=use_pleteau, L1_bias_template=L1_bias_template) pool = Pool(min(self.num_population + self.num_elite, cpu_count())) - population = self.reinit_pop(pool,self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, bias=bias, precision=precision) + population = self.reinit_pop(pool, self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, + bias=bias, precision=self.precision) if self.map_cstr: - self.cstr_list, self.num_free_order, self.num_free_par = self.map_cstr.get_cstr_list(copy.deepcopy(population[0]), fixed_sp_sz=self.fixedCluster) - for g in range(num_generations): + 
self.cstr_list, self.num_free_order, self.num_free_par = self.map_cstr.get_cstr_list( + copy.deepcopy(population[0]), fixed_sp_sz=self.fixedCluster) + + best_sol = None + no_change_counter = 0 + + if self.precision is None or self.precision == "FP32"or self.precision == "INT32": + num_gen = int(num_generations / 10) + else: + num_gen = int(num_generations) + + for g in range(num_gen): while self.num_parents < 1: # restart - population = self.reinit_pop(pool, self.num_population, self.stage_idx, self.best_sol_1st, self.init_pop, cur_gen=g, precision=precision) + population = self.reinit_pop(pool, self.num_population, self.stage_idx, self.best_sol_1st, + self.init_pop, cur_gen=g, precision=self.precision) print("Reinitialize population") - population, self.fitness, self.parents = self.select_parents(population, self.fitness, self.num_parents, self.num_population,) + population, self.fitness, self.parents = self.select_parents(population, self.fitness, self.num_parents, + self.num_population, ) elite = copy.deepcopy(self.parents[:self.num_elite]) self.elite_fitness = copy.deepcopy(self.fitness[:(len(elite))]) @@ -762,15 +871,13 @@ def run(self, dimension, stage_idx=0, prev_stage_value=0, num_population=100, nu else: self.swap_order(population, alpha=0.47) self.mutate_tile(population, num_mu_loc=3, range_alpha=0.53, alpha=0.53, is_finetune=False) - self.mutate_pe(population, alpha=1 if g==0 else 0.5) if self.num_pe<1 else None + self.mutate_pe(population, alpha=1 if g == 0 else 0.5) if self.num_pe < 1 else None self.mutate_par(population, alpha=0.1) - if self.map_cstr is None: self.born_cluster(population, alpha=0.57) self.kill_cluster(population, alpha=0.27) - # pop_inj, inj_fitness = self.injection() self.correctify_tile_dependency(population) # self.calculate_equivalent_num_pe(population) @@ -779,17 +886,30 @@ def run(self, dimension, stage_idx=0, prev_stage_value=0, num_population=100, nu # population = elite + population + pop_inj self.fitness = 
np.concatenate((self.elite_fitness, self.fitness)) # self.fitness = np.concatenate((self.elite_fitness, self.fitness, inj_fitness)) - chkpt = self.evaluate(pool=pool, population=population, cur_gen=g, precision=precision) + chkpt = self.evaluate(pool=pool, population=population, cur_gen=g, precision=self.precision) # self.check_tile_dependency(population) - if self.log_level>1: - if chkpt["best_sol"] is not None and self.log_level>1: - best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = self.get_indiv_info( chkpt["best_sol"], precision=precision) + if self.log_level > 1: + if chkpt["best_sol"] is not None and self.log_level > 1: + (best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, + best_power, best_num_pe, best_l1_read, best_l1_write, best_l2_read, best_l2_write, best_avg_pe, + best_avg_bw) = self.get_indiv_info(chkpt["best_sol"], precision=self.precision) # best_num_pe = chkpt["best_sol"][0][1] if self.num_pe<1 else self.num_pe # print(f"Runtime: {best_runtime}, L1: {best_l1_size}, L2: {best_l2_size}, L1_usage:{best_l1_size/self.l1_size:}, L2_usage:{best_l2_size/self.l2_size:.4f}, PE: {best_num_pe}") - print(f"Gen {g+1}: Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime}, Area: {best_area/1e6:.3f}mm2, PE Area_ratio: {best_num_pe*MAC_AREA_INT8/best_area*100:.1f}%, L1: {best_l1_size}, L2: {best_l2_size}, PE: {best_num_pe}") + print( + f"Gen {g + 1}: Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime}, Area: {best_area / 1e6:.3f}mm2, PE Area_ratio: {best_num_pe * MAC_AREA_INT8 / best_area * 100:.1f}%, L1: {best_l1_size}, L2: {best_l2_size}, PE: {best_num_pe}") else: - print(f"Gen {g+1}: Reward: {chkpt['best_reward'][0]:.3e}") + print(f"Gen {g + 1}: Reward: {chkpt['best_reward'][0]:.3e}") + + if best_sol == chkpt["best_sol"]: + no_change_counter += 1 + else: + best_sol = chkpt["best_sol"] + no_change_counter = 0 + + if no_change_counter 
> 100: + print(f"No improvement for 100 generations. Terminating at generation {g}.") + break population = self.sort_population(population) pool.close() @@ -799,23 +919,23 @@ def calculate_equivalent_num_pe(self, population): for idx in range(len(population)): indv = population[idx] num_pe, sp_sz = indv[0][1], indv[7][1] - num_cluster = num_pe//sp_sz - sp_dim_L2_loc = [i for i, item in enumerate(indv) if item[0]==indv[0][0] and i%7!=0] + num_cluster = num_pe // sp_sz + sp_dim_L2_loc = [i for i, item in enumerate(indv) if item[0] == indv[0][0] and i % 7 != 0] sp_real_tile_sizeL2 = indv[sp_dim_L2_loc[0]][1] sp_real_tile_sizeL1 = indv[sp_dim_L2_loc[1]][1] if sp_real_tile_sizeL2 > num_cluster: - sp_dim_sp_sizeL2 = ceil(sp_real_tile_sizeL2/num_cluster) + sp_dim_sp_sizeL2 = ceil(sp_real_tile_sizeL2 / num_cluster) using_num_cluster = num_cluster else: using_num_cluster = sp_real_tile_sizeL2 sp_dim_sp_sizeL2 = 1 if sp_dim_sp_sizeL2 < sp_real_tile_sizeL1: sp_dim_sp_sizeL2 = sp_real_tile_sizeL1 - using_num_cluster = ceil(sp_real_tile_sizeL2/sp_dim_sp_sizeL2) + using_num_cluster = ceil(sp_real_tile_sizeL2 / sp_dim_sp_sizeL2) indv[0][1] = using_num_cluster * sp_sz indv[sp_dim_L2_loc[0]][1] = sp_dim_sp_sizeL2 - if indv[0][1]>self.pe_limit: + if indv[0][1] > self.pe_limit: print("error1") if indv[sp_dim_L2_loc[0]][1] * using_num_cluster < sp_real_tile_sizeL2: print("error2") @@ -823,27 +943,27 @@ def calculate_equivalent_num_pe(self, population): def sort_population(self, population): population, self.fitness, self.parents = self.select_parents(population, self.fitness, self.num_parents, - self.num_population,) + self.num_population, ) return population - def thread_fun_correctify_tile_dependency(self, indv): return self.correctify_tile_dependency_thread(indv) def thread_fun(self, individual, precision=None): - reward, activity_count = self.oberserve_maestro(individual, precision=precision) + reward, activity_count = self.observe_maestro(individual, precision=precision) return 
[reward, activity_count] def get_indiv_info(self, individual, num_pe=None, l1_size=None, l2_size=None, NocBW=None, precision=None): - self.oberserve_maestro(individual,num_pe=num_pe, l1_size=l1_size, l2_size=l2_size, NocBW=NocBW, precision=precision) + self.observe_maestro(individual, num_pe=num_pe, l1_size=l1_size, l2_size=l2_size, NocBW=NocBW, + precision=precision) return self.observation def get_CONVtypeShape(self, dimensions, CONVtype=1): CONVtype = CONVtype_dicts[CONVtype] - if CONVtype == "CONV"or CONVtype=="DSCONV": + if CONVtype == "CONV" or CONVtype == "DSCONV": pass - elif CONVtype == "GEMM" or CONVtype=="SGEMM": - SzM, SzN, SzK,*a = dimensions + elif CONVtype == "GEMM" or CONVtype == "SGEMM": + SzM, SzN, SzK, *a = dimensions dimensions = [SzN, SzK, SzM, 1, 1, 1] elif CONVtype == "FC": SzOut, SzIn, *a = dimensions @@ -852,7 +972,7 @@ def get_CONVtypeShape(self, dimensions, CONVtype=1): print("Not supported layer.") return dimensions - def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None, precision=None): + def write_maestro(self, indv, layer_id=0, m_file=None, folder_path=None, precision=None): dimensions = [self.dimension] if layer_id != 0: m_file_with_layer = "{}_{}".format(m_file, layer_id) @@ -885,7 +1005,7 @@ def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None, pre fo.write("Cluster({},P);\n".format(d_sz)) else: sp = "SpatialMap" if d == indv[k][0] or ( - len(indv[k]) > 2 and d == indv[k][2]) else "TemporalMap" + len(indv[k]) > 2 and d == indv[k][2]) else "TemporalMap" # MAESTRO cannot take K dimension as dataflow file if not (m_type == "DSCONV"): fo.write("{}({},{}) {};\n".format(sp, d_sz, d_sz, self.get_out_repr(d))) @@ -900,13 +1020,14 @@ def write_maestro(self, indv, layer_id=0, m_file = None, folder_path = None, pre fo.write("}\n") fo.write("}") - def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW=None, offchipBW=None, precision=None): + def observe_maestro(self, indv, 
num_pe=None, l1_size=None, l2_size=None, NocBW=None, offchipBW=None, + precision=None): - m_file = "{}".format(random.randint(0, 2**32)) + m_file = "{}".format(random.randint(0, 2 ** 32)) self.write_maestro(indv, m_file=m_file, precision=precision) if num_pe: to_use_num_pe = self.num_pe_to_use(num_pe, precision) - elif self.num_pe <1: + elif self.num_pe < 1: to_use_num_pe = self.num_pe_to_use(indv[0][1], precision) else: to_use_num_pe = self.num_pe_to_use(self.num_pe, precision) @@ -919,8 +1040,10 @@ def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW "--offchip_bw_cstr={}".format(self.offchipBW if not offchipBW else offchipBW), "--noc_mc_support=true", "--num_pes={}".format(int(to_use_num_pe)), "--num_simd_lanes=1", "--l1_size_cstr={}".format(self.l1_size if not l1_size else l1_size), - "--l2_size_cstr={}".format(self.l2_size if not l2_size else l2_size), "--print_res=false", "--print_res_csv_file=true", "--print_log_file=false", "--print_design_space=false", "--msg_print_lv=0"] -# "--num_simd_lanes=1", "--l1_size_cstr={}".format(int(self.l1_to_use(self.l1_size, precision)) if not l1_size else + "--l2_size_cstr={}".format(self.l2_size if not l2_size else l2_size), "--print_res=false", + "--print_res_csv_file=true", "--print_log_file=false", "--print_design_space=false", + "--msg_print_lv=0"] + # "--num_simd_lanes=1", "--l1_size_cstr={}".format(int(self.l1_to_use(self.l1_size, precision)) if not l1_size else # int(self.l1_to_use(l1_size, precision))), process = Popen(command, stdout=PIPE, stderr=PIPE) @@ -952,7 +1075,13 @@ def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW l2_weight_write = np.array(df[" filter l2 write"]).reshape(-1, 1) l2_output_read = np.array(df[" output l2 read"]).reshape(-1, 1) l2_output_write = np.array(df[" output l2 write"]).reshape(-1, 1) + l1_read = l1_output_read + l1_weight_read + l1_input_read + l1_write = l1_output_write + l1_input_write + l1_weight_write + l2_read = 
l2_output_read + l2_weight_read + l2_input_read + l2_write = l2_output_write + l2_input_write + l2_weight_write mac = np.array(df[" Num MACs"]).reshape(-1, 1) + avg_pe_utilized = np.array(df["Avg number of utilized PEs"]).reshape(-1, 1) + avg_bw = np.array(df[" Avg BW Req"]).reshape(-1, 1) activity_count = {} activity_count["l1_input_read"] = l1_input_read activity_count["l1_input_write"] = l1_input_write @@ -967,7 +1096,7 @@ def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW activity_count["l2_output_read"] = l2_output_read activity_count["l2_output_write"] = l2_output_write activity_count["mac_activity"] = mac - os.remove("./{}.csv".format(m_file)) if os.path.exists("./{}.csv".format(m_file)) else None + os.remove("./{}.csv".format(m_file)) if os.path.exists("./{}.csv".format(m_file)) else None os.remove("./log.txt") if os.path.exists("./log.txt") else None if self.external_area_model: area = self.compute_area_external(to_use_num_pe, l1_size, l2_size) @@ -975,18 +1104,22 @@ def oberserve_maestro(self, indv, num_pe=None, l1_size=None, l2_size=None, NocBW area = self.compute_area_maestro(to_use_num_pe, l1_size, l2_size) self.observation = [np.mean(x) for x in [runtime, throughput, energy, area, l1_size, l2_size, mac, power, - self.restore_num_pe(to_use_num_pe, precision)]] + self.restore_num_pe(to_use_num_pe, precision), l1_read, l1_write, + l2_read,l2_write, avg_pe_utilized, avg_bw]] + def catch_exception(): - if l1_size>self.l1_size or l2_size>self.l2_size or any(runtime_series<1) or any(l1_size_series<1) or any(l2_size_series<1): + if l1_size > self.l1_size or l2_size > self.l2_size or any(runtime_series < 1) or any( + l1_size_series < 1) or any(l2_size_series < 1): return True else: return False + stdout_as_str = stdout.decode("utf-8") stdout_as_str = "".join(stdout_as_str.split()) # if (len(str(stdout))>3 and stdout_as_str[:len("Numpartialsumsislessthan0!")]!="Numpartialsumsislessthan0!") or catch_exception() or not 
self.validTo_external_mem_cstr(indv, num_pe=to_use_num_pe): # if len(str(stdout))>3 or catch_exception() or not self.validTo_external_mem_cstr(indv, num_pe=to_use_num_pe): - if catch_exception() or not self.validTo_external_mem_cstr(indv, num_pe=to_use_num_pe): - # if catch_exception(): + if catch_exception() or not self.validTo_external_mem_cstr(indv, num_pe=to_use_num_pe): + # if catch_exception(): return None, None return self.judge(), activity_count except: @@ -1000,12 +1133,20 @@ def num_pe_to_use(self, num_pe, precision): return num_pe / 2 if precision == "FP8": return num_pe / 4 + if precision == "FP4": + return num_pe / 8 + if precision == "FP2": + return num_pe / 16 if precision == "INT32": return num_pe if precision == "INT16": return num_pe / 2 if precision == "INT8": return num_pe / 4 + if precision == "INT4": + return num_pe / 8 + if precision == "INT2": + return num_pe / 16 def restore_num_pe(self, num_pe, precision): @@ -1015,12 +1156,20 @@ def restore_num_pe(self, num_pe, precision): return num_pe * 2 if precision == "FP8": return num_pe * 4 + if precision == "FP4": + return num_pe * 8 + if precision == "FP2": + return num_pe * 16 if precision == "INT32": return num_pe if precision == "INT16": return num_pe * 2 if precision == "INT8": return num_pe * 4 + if precision == "INT4": + return num_pe * 8 + if precision == "INT2": + return num_pe * 16 def l1_to_use(self, l1_size, precision): @@ -1036,7 +1185,8 @@ def l1_to_use(self, l1_size, precision): return l1_size * 2 if precision == "INT8": return l1_size * 4 - def impose_halloffame(self, observe_value, target="latency_ave" ): + + def impose_halloffame(self, observe_value, target="latency_ave"): is_violated = False if self.stat is not None: target_value = self.stat[target] @@ -1045,24 +1195,26 @@ def impose_halloffame(self, observe_value, target="latency_ave" ): return is_violated def compute_area_maestro(self, num_pe, l1_size, l2_size): - MAC_AREA_MAESTRO=4470 + MAC_AREA_MAESTRO = 4470 
L2BUF_AREA_MAESTRO = 4161.536 L1BUF_AREA_MAESTRO = 4505.1889 L2BUF_UNIT = 32768 L1BUF_UNIT = 64 - area = num_pe * MAC_AREA_MAESTRO + ceil(int(l2_size)/L2BUF_UNIT)*L2BUF_AREA_MAESTRO + ceil(int(l1_size)/L1BUF_UNIT)*L1BUF_AREA_MAESTRO * num_pe + area = num_pe * MAC_AREA_MAESTRO + ceil(int(l2_size) / L2BUF_UNIT) * L2BUF_AREA_MAESTRO + ceil( + int(l1_size) / L1BUF_UNIT) * L1BUF_AREA_MAESTRO * num_pe return area def compute_area_external(self, num_pe, l1_size, l2_size): - MAC_AREA_INT8=282 - MAC_AREA_INT32=3495 + MAC_AREA_INT8 = 282 + MAC_AREA_INT32 = 3495 BUF_AREA_perbit = 0.086 buf_size = l1_size * num_pe + l2_size area = num_pe * MAC_AREA_INT8 + buf_size * BUF_AREA_perbit * 8 return area def judge(self): - runtime, throughput, energy, area, l1_size, l2_size, mac, power, num_pe = self.observation + (runtime, throughput, energy, area, l1_size, l2_size, mac, power, num_pe, _, _, _, _, + _, _) = self.observation def get_objective(objective): values = [] @@ -1095,11 +1247,11 @@ def get_objective(objective): reward = -l2_size elif term == "power": reward = -power - elif term =="ranking": + elif term == "ranking": reward = -1 - elif term =="L-PE-L2": + elif term == "L-PE-L2": reward = -runtime * num_pe * l2_size - elif term =="L-PE": + elif term == "L-PE": reward = -runtime * num_pe elif term == "PE": reward = -num_pe @@ -1110,22 +1262,27 @@ def get_objective(objective): return [float("-Inf")] * len(self.fitness_objective) values.append(reward) return values + values = get_objective(self.fitness_objective) return values - def print_indv(self, indv,fd=False): + def print_indv(self, indv, fd=False): for k in range(0, len(indv), 7): if fd: - fd.write("\n{}".format(indv[k:k+7])) + fd.write("\n{}".format(indv[k:k + 7])) else: - print(indv[k:k+7]) + print(indv[k:k + 7]) def init_arguement(self, dimension=None, stage_idx=0, prev_stage_value=0, num_population=100, num_generations=100, elite_ratio=0.05, - parents_ratio=0.15, ratio_decay=1, num_finetune=1, best_sol_1st=None, 
init_pop=None, uni_base=False, use_factor=False, use_pleteau=False,L1_bias_template=None): + parents_ratio=0.15, ratio_decay=1, num_finetune=1, best_sol_1st=None, init_pop=None, + uni_base=False, use_factor=False, use_pleteau=False, L1_bias_template=None): self.stage_idx = stage_idx self.num_generations = num_generations - self.num_population = num_population + if self.precision is None or self.precision == "FP32" or self.precision == "INT32": + self.num_population = num_population + else: + self.num_population = num_population*2 self.prev_stage_value = prev_stage_value self.ratio_decay = ratio_decay self.best_sol_1st = best_sol_1st @@ -1137,13 +1294,12 @@ def init_arguement(self, dimension=None, stage_idx=0, prev_stage_value=0, num_po self.best_activity = None self.best_sol = None self.stat_list = [] - self.uni_base =uni_base + self.uni_base = uni_base self.stat = None self.pleteau_sol = dict() self.use_factor = use_factor self.use_pleteau = use_pleteau self.best_reward_pleteau = None self.best_sol_pleteau = None - self.normalize=True if self.fitness_objective[0][:1] == "n" else False - self.L1_bias_template =L1_bias_template - + self.normalize = True if self.fitness_objective[0][:1] == "n" else False + self.L1_bias_template = L1_bias_template diff --git a/src/GAMMA/main.py b/src/GAMMA/main.py index 2d180cb..c023067 100644 --- a/src/GAMMA/main.py +++ b/src/GAMMA/main.py @@ -3,8 +3,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--fitness1', type=str, default="latency", choices=('latency', 'energy', 'power', 'EDP', 'area'), help='First objective') - parser.add_argument('--fitness2', type=str, default="energy", choices=('latency', 'energy', 'power', 'EDP', 'area'), help='Second objective') - parser.add_argument('--num_pop', type=int, default=20,help='Number of populations') + parser.add_argument('--fitness2', type=str, default="energy",choices=('latency', 'energy', 'power', 'EDP', 'area'), help='Second objective') + 
parser.add_argument('--num_pop', type=int, default=20 ,help='Number of populations') parser.add_argument('--parRS', default=False, action='store_true', help='Parallize across R S dimension') parser.add_argument('--epochs', type=int, default=2, help='Number of epochs (i.e., Numbers of generations)') parser.add_argument('--outdir', type=str, default="outdir", help='Output directiory') diff --git a/src/GAMMA/train.py b/src/GAMMA/train.py index bf901e2..f5ff699 100644 --- a/src/GAMMA/train.py +++ b/src/GAMMA/train.py @@ -4,6 +4,7 @@ import glob import os, sys +import cost_database as cdb script_dir = os.path.dirname(__file__) module_path = os.path.abspath(os.path.join(script_dir, '../')) @@ -84,18 +85,22 @@ def train_model(model_defs, input_arg, chkpt_file='./chkpt', precisions=None): l1_size=opt.l1_size, l2_size=opt.l2_size, pe_limit=opt.pe_limit, area_pebuf_only=False, external_area_model=True, map_cstr=map_cstr, - slevel_max=get_value_for_precision(precision), slevel_min=get_value_for_precision(precision)) + slevel_max=get_value_for_precision(precision), slevel_min=get_value_for_precision(precision), + precision=precision) #tolto calcolo su l1 value da verificare se rimettere chkpt, pops = env.run(dimension, stage_idx=0, num_population=opt.num_pop, prev_stage_value=None, num_generations=opt.epochs, best_sol_1st=None, init_pop=None, bias=None, uni_base=True, use_factor=opt.use_factor, - use_pleteau=False, precision=precision) + use_pleteau=False) best_sol = chkpt["best_sol"] - best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, best_num_pe = env.get_indiv_info( - best_sol, num_pe=None, precision=precision) + (best_runtime, best_throughput, best_energy, best_area, best_l1_size, best_l2_size, best_mac, best_power, + best_num_pe, best_l1_read, best_l1_write, best_l2_read, best_l2_write, best_avg_pe, best_avg_bw) = \ + (env.get_indiv_info(best_sol, num_pe=None, precision=precision)) print("Mapping:", chkpt["best_sol"]) 
print( - f"Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime:.0f}(cycles), Area: {best_area / 1e6:.3f}(mm2), PE Area_ratio: {best_num_pe * MAC_AREA_INT8 / best_area * 100:.1f}%, Num_PE: {best_num_pe:.0f}, L1 Buffer: {best_l1_size:.0f}(elements), L2 Buffer: {best_l2_size:.0f}(elements)") + f"{num_layer}. Reward: {chkpt['best_reward'][0]:.3e}, Runtime: {best_runtime:.0f}(cycles), " + f"Area: {best_area / 1e6:.3f}(mm2), Num_PE: {best_num_pe:.0f}, L1 Buffer: {best_l1_size:.0f}(elements)," + f" L2 Buffer: {best_l2_size:.0f}(elements)") chkpt = { "reward": chkpt['best_reward'][0], "Best_solution": best_sol, @@ -103,13 +108,19 @@ def train_model(model_defs, input_arg, chkpt_file='./chkpt', precisions=None): "Throughput (MACs/Cycle)": best_throughput, "Activity count-based Energy (nJ)": best_energy, "Area": best_area, - "PE_Area_Ratio": best_num_pe * MAC_AREA_INT8 / best_area, "PE": best_num_pe, "PE_area": best_num_pe * MAC_AREA_INT8, "L1_area": best_l1_size * best_num_pe * BUF_AREA_perbit * 8, "L2_area": best_l2_size * BUF_AREA_perbit * 8, "L1_size": best_l1_size, - "L2_size": best_l2_size + "L2_size": best_l2_size, + "L1_read": best_l1_read, + "L1_write": best_l1_write, + "L2_read": best_l2_read, + "L2_write": best_l2_write, + "#MACs": best_mac, + "Avg #PE utilized": best_avg_pe, + "Avg BW": best_avg_bw } chkpt_list.append(chkpt) if opt.num_layer != 0: @@ -121,8 +132,9 @@ def train_model(model_defs, input_arg, chkpt_file='./chkpt', precisions=None): num_layer += 1 - columns = ["Runtime", "Throughput (MACs/Cycle)", "Activity count-based Energy (nJ)", "Area", "PE_Area_Ratio", "PE", "L1_size", "L2_size", "PE_area", "L1_area", "L2_area", - "Best_solution"] + columns = ["Runtime", "Throughput (MACs/Cycle)", "Activity count-based Energy (nJ)", "PE", + "L1_size", "L2_size", "L1_read", "L1_write", "L2_read", "L2_write", + "#MACs", "Avg #PE utilized", "Avg BW", "Best_solution"] np_array = None for chkpt in chkpt_list: if np_array is None: @@ -131,6 +143,42 @@ def 
train_model(model_defs, input_arg, chkpt_file='./chkpt', precisions=None): np_array = np.vstack( [np_array, np.array([chkpt[t] for t in columns[:-1]] + [f'{chkpt["Best_solution"]}']).reshape(1, -1)]) df = pd.DataFrame(np_array, columns=columns) + + if precisions is not None: + df['L1_size(bytes)'] = df.apply( + lambda row: convert_to_bytes(row['L1_size'], precisions[df.index.get_loc(row.name)]), + axis=1).astype(int) + # Add 'sram_size' to the DataFrame + df['L1_normalized_size'] = df['L1_size(bytes)'].apply(find_sram_size) + df['L1_read_energy'] = df.apply(lambda row: float(row['L1_read']) * cdb.get_sram_data(row['L1_normalized_size'], 'Read'), + axis=1) + df['L1_write_energy'] = df.apply( + lambda row: float(row['L1_write']) * cdb.get_sram_data(row['L1_normalized_size'], 'Write'), axis=1) + df['L2_size(bytes)'] = df.apply( + lambda row: convert_to_bytes(row['L2_size'], precisions[df.index.get_loc(row.name)]), axis=1).astype(int) + + df['L2_normalized_size'] = df['L2_size(bytes)'].apply(find_sram_size) + df['L2_read_energy'] = df.apply(lambda row: float(row['L2_read']) * cdb.get_sram_data(row['L2_normalized_size'], 'Read'), + axis=1) + df['L2_write_energy'] = df.apply( + lambda row: float(row['L2_write']) * cdb.get_sram_data(row['L2_normalized_size'], 'Write'), axis=1) + df['MAC_energy'] = df.apply(lambda row: cdb.get_energy(operation='MAC', precision=precisions[df.index.get_loc(row.name)]) * float(row['#MACs']), axis=1) + df['NoC_energy'] = df.apply(lambda row: cdb.calculate_noc_dyn_energy(precision=precisions[row.name], bw=row['Avg BW']), axis=1) + df['L1 energy'] = df['L1_read_energy'] + df['L1_write_energy'] + df['L2 energy'] = df['L2_read_energy'] + df['L2_write_energy'] + df['Activity count-based Energy (nJ)'] = df['Activity count-based Energy (nJ)'].astype(float).astype(int) + df['Runtime'] = df['Runtime'].astype(float).astype(int) + df['EDP'] = df['Activity count-based Energy (nJ)'] * df['Runtime'] + df['#MACs'] = df['#MACs'].astype(float).astype(int) + 
df['Runtime'] = df['Runtime'].astype(float).astype(int) + df['Activity count-based Energy (nJ)'] = df['Activity count-based Energy (nJ)'].astype('int') + df['PE'] = df['PE'].astype(float).astype(int) + df['L1_size'] = df['L1_size'].astype(float).astype(int) + df['L2_size'] = df['L2_size'].astype(float).astype(int) + df['L1_read'] = df['L1_read'].astype(float).astype(int) + df['L1_write'] = df['L1_write'].astype(float).astype(int) + df['L2_read'] = df['L2_read'].astype(float).astype(int) + df['L2_write'] = df['L2_write'].astype(float).astype(int) df.to_csv(chkpt_file[:-4] + ".csv", index_label="Layer") with open(chkpt_file, "wb") as fd: @@ -146,7 +194,7 @@ def get_cstr_name(mapping_cstr): def get_value_for_precision(precision): - if precision is None or precision == "FP32": + if precision is None or precision == "FP32" or precision == "INT32": return 2 else: return 3 @@ -159,12 +207,20 @@ def get_value_for_pe(precision, num_pe): return int(num_pe * 2) if precision == "FP8": return int(num_pe * 4) + if precision == "FP4": + return int(num_pe * 8) + if precision == "FP2": + return int(num_pe * 16) if precision == "INT32": return int(num_pe) if precision == "INT16": return int(num_pe * 2) if precision == "INT8": return int(num_pe * 4) + if precision == "INT4": + return int(num_pe * 8) + if precision == "INT2": + return int(num_pe * 16) def get_value_for_l1(precision, l1_size): @@ -203,3 +259,21 @@ def map_constraints(map_cstr, opt, precision): put_into_actual_cstr(costmodel_cstr, map_cstr) return map_cstr + + +def convert_to_bytes(size, precision): + if precision in cdb.precision_to_bits: + bits = cdb.precision_to_bits[precision] + return (int(float(size)) * bits) / 8 + else: + raise ValueError(f"Unknown precision: {precision}") + + +# Function to find the appropriate SRAM size +def find_sram_size(required_bytes): + for size in sorted(cdb.sram_data.keys()): + if size >= required_bytes: + return size + raise ValueError(f"No suitable SRAM size found for {required_bytes} 
bytes") + + diff --git a/src/Other_Blackbox_optimization/main.py b/src/Other_Blackbox_optimization/main.py index 128e738..581992d 100644 --- a/src/Other_Blackbox_optimization/main.py +++ b/src/Other_Blackbox_optimization/main.py @@ -29,9 +29,9 @@ def get_reward(env, indv): if opt.fixedCluster>0: - reward = env.oberserve_maestro(indv, sp2_sz=opt.fixedCluster) + reward = env.observe_maestro(indv, sp2_sz=opt.fixedCluster) else: - reward = env.oberserve_maestro(indv) + reward = env.observe_maestro(indv) if reward is None: reward = [-2**63] reward = reward[0] diff --git a/src/utils/constraint.py b/src/utils/constraint.py index 5aea439..0c86989 100644 --- a/src/utils/constraint.py +++ b/src/utils/constraint.py @@ -1,20 +1,24 @@ import random import numpy as np from collections import OrderedDict -dimension_set = {"K","C","R","S","Y","X"} + +dimension_set = {"K", "C", "R", "S", "Y", "X"} + + class Constraint(): - def __init__(self,num_pe=256): + def __init__(self, num_pe=256): self.lastcluster_dict = {} self.cstr_list = [None, None, None] self.num_free_order = 21 self.num_pe = num_pe + def set_constraint(self, level, cstr): if level == "L3": L = 2 if level == "L2": L = 1 if level == "L1": - L = 0 + L = 0 if self.cstr_list[L] is None: self.cstr_list[L] = OrderedDict(cstr) else: @@ -24,22 +28,24 @@ def set_constraint(self, level, cstr): self.cstr_list[L][key] = value def impose_constraint(self, indv, fixed_sp_sz=0): - num_levels = len(indv)//7 + num_levels = len(indv) // 7 index_offset = 0 self.lastcluster_dict = {} - for nl in range(num_levels-1,-1,-1): + for nl in range(num_levels - 1, -1, -1): self.set_valid_value_v2(self.cstr_list[nl], index_offset, indv, fixed_sp_sz=fixed_sp_sz) index_offset += 7 def create_from_constraint(self, indv, fixed_sp_sz=0, dimension_dict=None): - num_levels = len(indv)//7 + num_levels = len(indv) // 7 index_offset = 0 self.dimension_dict = dimension_dict self.lastcluster_dict = {} ret_num_free_order = 0 ret_num_free_par = 0 - for nl in 
range(num_levels-1,-1,-1): - num_free_order, num_free_par, free_orders,num_free_tile = self.set_valid_value_v2(self.cstr_list[nl], index_offset, indv, fixed_sp_sz=fixed_sp_sz) + for nl in range(num_levels - 1, -1, -1): + num_free_order, num_free_par, free_orders, num_free_tile = self.set_valid_value_v2(self.cstr_list[nl], + index_offset, indv, + fixed_sp_sz=fixed_sp_sz) index_offset += 7 ret_num_free_order += num_free_order ret_num_free_par += num_free_par @@ -58,10 +64,10 @@ def get_cstr_list(self, indv, fixed_sp_sz=0): ret_num_free_order, ret_num_free_par = self.create_from_constraint(indv, fixed_sp_sz, self.dimension_dict) return self.reverse_cstr_list(self.cstr_list), ret_num_free_order, ret_num_free_par - def set_valid_value(self, lever_cstr, index_offset, indv,fixed_sp_sz=0): + def set_valid_value(self, lever_cstr, index_offset, indv, fixed_sp_sz=0): for key, value in lever_cstr.items(): if key == "sp": - if indv[index_offset][0] not in value: + if indv[index_offset][0] not in value: sp = np.random.choice(value, 1)[0] indv[index_offset][0] = sp if len(self.lastcluster_dict) > 0: @@ -77,18 +83,14 @@ def set_valid_value(self, lever_cstr, index_offset, indv,fixed_sp_sz=0): valid_value = random.randint(1, indv[i][1]) self.lastcluster_dict[indv[i][0]] = indv[i][1] if key in dimension_set: - if valid_value > indv[change_idx][1]: + if valid_value > indv[change_idx][1]: indv[change_idx][1] = valid_value - - - - def set_valid_value_v2(self, lever_cstr, index_offset, indv, fixed_sp_sz=0): num_free_order = 6 - num_free_par = 1 + num_free_par = 1 num_free_tile = 6 - free_orders = {"X", "Y", "K","C","R","S"} + free_orders = {"X", "Y", "K", "C", "R", "S"} for key, value in lever_cstr.items(): if key == "sp": sp = np.random.choice(value, 1)[0] @@ -98,12 +100,14 @@ def set_valid_value_v2(self, lever_cstr, index_offset, indv, fixed_sp_sz=0): sp_sz = fixed_sp_sz else: if sp != indv[index_offset][0]: - sp_sz = random.randint(1,min(self.num_pe if self.num_pe>0 else float('Inf'), 
self.lastcluster_dict[sp])) if len(self.lastcluster_dict) >0 else self.dimension_dict[sp] + sp_sz = random.randint(1, min(self.num_pe if self.num_pe > 0 else float('Inf'), + self.lastcluster_dict[sp])) if len( + self.lastcluster_dict) > 0 else self.dimension_dict[sp] indv[index_offset] = [sp, sp_sz] num_free_par = 0 - elif key== "sp2": + elif key == "sp2": sp2 = np.random.choice(value, 1)[0] - if len(indv[index_offset])>2: + if len(indv[index_offset]) > 2: indv[index_offset][2] = sp2 else: indv[index_offset].append(sp2) @@ -111,7 +115,9 @@ def set_valid_value_v2(self, lever_cstr, index_offset, indv, fixed_sp_sz=0): if type(value) is int: indv[index_offset][1] = value else: - indv[index_offset][1] = random.randint(1,min(self.num_pe if self.num_pe>0 else float('Inf'), self.lastcluster_dict[value])) if len(self.lastcluster_dict) >0 else self.dimension_dict[value] + indv[index_offset][1] = random.randint(1, min(self.num_pe if self.num_pe > 0 else float('Inf'), + self.lastcluster_dict[value])) if len( + self.lastcluster_dict) > 0 else self.dimension_dict[value] elif key == "order": free_orders -= set(value) tile_dict = OrderedDict() @@ -121,9 +127,9 @@ def set_valid_value_v2(self, lever_cstr, index_offset, indv, fixed_sp_sz=0): num_free_order -= num_fixed_order for value_idx, i in enumerate(range(index_offset + 1 + num_free_order, index_offset + 7)): indv[i][0] = value[value_idx] - indv[i][1] = tile_dict[indv[i][0]] + indv[i][1] = tile_dict[indv[i][0]] tile_dict.pop(indv[i][0], None) - for i in range(index_offset + 1, index_offset + 1 + num_free_order): + for i in range(index_offset + 1, index_offset + 1 + num_free_order): indv[i][0], indv[i][1] = tile_dict.popitem(last=False) elif key in dimension_set: num_free_tile -= 1 @@ -135,16 +141,15 @@ def set_valid_value_v2(self, lever_cstr, index_offset, indv, fixed_sp_sz=0): else: if type(value) is list: left, right = value - right_num = self.lastcluster_dict[right] if len(self.lastcluster_dict) >0 else 
self.dimension_dict[right] + right_num = self.lastcluster_dict[right] if len(self.lastcluster_dict) > 0 else \ + self.dimension_dict[right] # indv[change_idx][1] = random.randint(left, right_num) - indv[change_idx][1] = right_num + indv[change_idx][1] = right_num else: - indv[change_idx][1] = self.lastcluster_dict[value] if len(self.lastcluster_dict) >0 else self.dimension_dict[value] + indv[change_idx][1] = self.lastcluster_dict[value] if len( + self.lastcluster_dict) > 0 else self.dimension_dict[value] break - - for i in range(index_offset + 1, index_offset + 7): self.lastcluster_dict[indv[i][0]] = indv[i][1] - return num_free_order, num_free_par, free_orders,num_free_tile - + return num_free_order, num_free_par, free_orders, num_free_tile