1+ #!/usr/bin/env python3
2+ # -*- coding: utf-8 -*-
3+
4+ """
5+ GAPBS和其他工作负载自动化运行脚本
6+ 运行所有程序并将日志保存到artifact文件夹的相应位置
7+ """
8+
9+ import os
10+ import subprocess
11+ import time
12+ import argparse
13+ import logging
14+ import shutil
15+ import itertools
16+ from pathlib import Path
17+ from datetime import datetime
18+
# Root logging setup: INFO level and above, timestamped to the second.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
26+
# Policy names, one list per policy category. Each list is used as the
# argparse `choices` (and default) of the matching --*-policies option; one
# name from each category is combined into the value passed after the
# simulator's -k flag.
ALLOCATION_POLICIES = ["none", "interleave", "numa"]
MIGRATION_POLICIES = ["none", "heataware", "frequency", "loadbalance", "locality", "lifetime", "hybrid"]
PAGING_POLICIES = ["none", "hugepage", "pagetableaware"]
CACHING_POLICIES = ["none", "fifo", "frequency"]

# Base paths. Both are relative, so they resolve against the current working
# directory of the process running this script.
ARTIFACT_BASE = "../artifact"  # root directory that receives every log file
CXL_MEM_SIM = "./CXLMemSim"  # simulator executable placed first on the command line
# Workload table. Each entry maps a workload name to:
#   "path"     - directory joined with a program name to form the executable path
#   "programs" - executables to run, one after another
#   "args"     - argument string appended to the command line ("" for none)
#   "env"      - extra environment variables layered over the current environment
WORKLOADS = {
    "gapbs": {
        "path": "../workloads/gapbs",
        "programs": [
            "bc", "bfs", "cc", "pr", "sssp", "tc"  # GAPBS kernels to run
        ],
        "args": "-g 16 -n 1",  # default arguments
        "env": {}  # default environment variables
    },
    "memcached": {
        # NOTE(review): this path starts with "./workloads" while the sibling
        # entries use "../workloads" - confirm which directory is intended.
        "path": "./workloads/memcached",
        "programs": ["memcached"],
        # NOTE(review): "try" is presumably a local user name for memcached's
        # -u option - confirm it exists on the target machine.
        "args": "-u try",
        "env": {}
    },
    "llama": {
        "path": "../workloads/llama.cpp/build/bin",
        "programs": ["llama-cli"],
        "args": "--model ../workloads/llama.cpp/build/DeepSeek-R1-Distill-Qwen-32B-Q2_K.gguf --cache-type-k q8_0 --threads 16 --prompt '<|User|>What is 1+1?<|Assistant|>' -no-cnv",
        "env": {}
    },
    "gromacs": {
        "path": "../workloads/gromacs/build/bin",
        "programs": ["gmx"],
        "args": "mdrun -s ../workloads/gromacs/build/topol.tpr -nsteps 1000",
        "env": {}
    },
    "vsag": {
        # The "program" here is the system Python interpreter; the script to
        # run and its options are carried entirely in "args".
        "path": "/usr/bin/",
        "programs": ["python3"],
        "args": "run_algorithm.py --dataset random-xs-20-angular --algorithm vsag --module ann_benchmarks.algorithms.vsag --constructor Vsag --runs 2 --count 10 --batch '[\' angular\' , 20, {\' M\' : 24, \' ef_construction\' : 300, \' use_int8\' : 4, \' rs\' : 0.5}]' '[10]' '[20]' '[30]' '[40]' '[60]' '[80]' '[120]' '[200]' '[400]' '[600]' '[800]'",
        "env": {}
    },
    "microbench": {
        "path": "./microbench",
        "programs": ["ld", "st", "ld_serial", "st_serial", "malloc", "writeback"],
        "args": "",
        "env": {}
    }
}
def ensure_directory(path):
    """Make sure ``path`` exists as a directory and return it unchanged.

    Missing parent directories are created as well. Returning the path lets
    callers create and bind a directory in a single expression.
    """
    already_there = os.path.isdir(path)
    if not already_there:
        os.makedirs(path, exist_ok=True)
    return path
82+
def _write_log(log_path, text):
    """Best-effort write of ``text`` to ``log_path``; return True on success.

    The parent directory is created when missing. A failure to write is
    logged as a warning instead of being raised, so a logging problem can
    never be mistaken for (or hide) the result of the command itself.
    """
    if not log_path:
        return False
    try:
        parent = os.path.dirname(log_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # errors='replace' keeps the write from failing on characters the
        # locale encoding cannot represent.
        with open(log_path, 'w', errors='replace') as f:
            f.write(text)
    except OSError as e:
        logger.warning(f"无法写入日志文件 {log_path}: {e}")
        return False
    return True


def run_command(cmd, log_path=None, env=None, timeout=3600, shell=False):
    """Run a command, capture its combined stdout/stderr and optionally log it.

    Args:
        cmd (list or str): Argument list, or a single command string (use a
            string together with ``shell=True``).
        log_path (str, optional): File that receives the captured output.
        env (dict, optional): Extra environment variables layered on top of
            a copy of the current environment.
        timeout (int, optional): Seconds to wait before aborting the command.
        shell (bool, optional): Whether to execute through the shell.

    Returns:
        tuple: ``(returncode, output)``. ``returncode`` is the command's exit
        status, ``-1`` when the command timed out, or ``-2`` when it could
        not be started or failed unexpectedly; in the latter two cases
        ``output`` is a short ``TIMEOUT:``/``ERROR:`` message.
    """
    cmd_str = cmd if isinstance(cmd, str) else ' '.join(map(str, cmd))
    logger.info(f"运行命令: {cmd_str} ")

    # Build the child environment without touching os.environ itself.
    run_env = os.environ.copy()
    if env:
        run_env.update(env)

    try:
        process = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr into the same stream
            env=run_env,
            text=True,
            errors="replace",  # undecodable bytes must not turn a finished run into a failure
            timeout=timeout,
            check=False,
            shell=shell
        )
    except subprocess.TimeoutExpired as e:
        logger.error(f"命令执行超时({timeout} 秒): {cmd_str} ")
        # Keep whatever the command printed before it was killed. The
        # captured output may be bytes even in text mode, or None.
        partial = e.output
        if isinstance(partial, bytes):
            partial = partial.decode(errors="replace")
        partial = partial or ""
        if partial and not partial.endswith("\n"):
            partial += "\n"
        _write_log(log_path, partial + f"TIMEOUT: Command timed out after {timeout} seconds\n ")
        return -1, f"TIMEOUT: Command timed out after {timeout} seconds"
    except Exception as e:
        # Boundary handler: the command could not be launched (missing binary,
        # permissions, bad arguments, ...). Report it through the return value.
        logger.error(f"命令执行失败: {e} ")
        _write_log(log_path, f"ERROR: {str(e)} \n ")
        return -2, f"ERROR: {str(e)} "

    output = process.stdout
    if _write_log(log_path, output):
        logger.info(f"输出已保存到: {log_path} ")

    return process.returncode, output
140+
def run_original(workload, program, args, base_dir, timeout=3600):
    """Run a workload program directly (without the simulator).

    Args:
        workload: Key into ``WORKLOADS``; supplies the directory and the
            extra environment variables.
        program: Executable name inside the workload's ``path``.
        args: Argument string appended to the command; may be empty.
        base_dir: Directory that receives the ``orig.txt`` log.
        timeout: Seconds before the run is aborted; forwarded to
            ``run_command``.

    Returns:
        The ``(returncode, output)`` pair produced by ``run_command``.
    """
    config = WORKLOADS[workload]
    program_path = os.path.join(config["path"], program)
    log_path = os.path.join(base_dir, "orig.txt")

    # A single string, because the command is executed through the shell.
    cmd = f"{program_path} {args} " if args else program_path

    return run_command(cmd, log_path, config.get("env"), timeout=timeout, shell=True)


def run_cxl_mem_sim(workload, program, args, base_dir, policy_combo=None, pebs_period=10,
                    latency="200,250,200,250,200,250", bandwidth="50,50,50,50,50,50",
                    timeout=3600):
    """Run a workload program under the CXLMemSim executable.

    Args:
        workload: Key into ``WORKLOADS``; supplies the directory and the
            extra environment variables.
        program: Executable name inside the workload's ``path``.
        args: Argument string appended to the target program; may be empty.
        base_dir: Directory that receives the log file.
        policy_combo: Optional sequence of policy names. When given, the
            names are comma-joined after ``-k`` and underscore-joined into
            the log file name.
        pebs_period: Value passed after ``-p``.
        latency: Value passed after ``-l``.
        bandwidth: Value passed after ``-b``.
        timeout: Seconds before the run is aborted; forwarded to
            ``run_command``.

    Returns:
        The ``(returncode, output)`` pair produced by ``run_command``.
    """
    config = WORKLOADS[workload]
    program_path = os.path.join(config["path"], program)

    # One log file per policy combination so runs do not overwrite each other.
    if policy_combo:
        policy_str = '_'.join(policy_combo)
        # NOTE(review): the literal below has a space before ".txt", so the
        # resulting file names end in " .txt" - confirm this is intended.
        log_name = f"cxlmemsim_{policy_str} .txt"
    else:
        log_name = "cxlmemsim.txt"
    log_path = os.path.join(base_dir, log_name)

    # The target and its arguments travel as ONE argument after -t.
    target_with_args = f"{program_path} {args} " if args else program_path

    cmd = [
        CXL_MEM_SIM,
        "-t", target_with_args,
        "-p", str(pebs_period),
        "-l", latency,
        "-b", bandwidth
    ]
    if policy_combo:
        cmd.extend(["-k", ",".join(policy_combo)])

    return run_command(cmd, log_path, config.get("env"), timeout=timeout)


def generate_policy_combinations(args):
    """Return every (allocation, migration, paging, caching) policy tuple.

    Each category is read from the matching ``args.*_policies`` attribute;
    a missing or empty selection falls back to ``["none"]``. The result is
    the Cartesian product of the four selections, as a list of 4-tuples.
    """
    selections = (
        args.allocation_policies or ["none"],
        args.migration_policies or ["none"],
        args.paging_policies or ["none"],
        args.caching_policies or ["none"],
    )
    return list(itertools.product(*selections))


def _report_failure(args, message):
    """Log ``message`` unless errors are ignored; return True if the run must stop."""
    if args.ignore_errors:
        return False
    logger.error(message)
    return bool(args.stop_on_error)


def _run_program(args, workload_name, workload_config, program, policy_combinations, timeout):
    """Run one program natively and/or under CXLMemSim; return False to abort everything."""
    logger.info(f"开始处理程序: {program} ")

    # Every program gets its own artifact sub-directory.
    program_dir = ensure_directory(os.path.join(ARTIFACT_BASE, workload_name, program))

    if args.run_original:
        logger.info(f"运行原始程序: {program} ")
        returncode, _ = run_original(
            workload_name,
            program,
            workload_config["args"],
            program_dir,
            timeout=timeout
        )
        if returncode != 0 and _report_failure(
                args, f"原始程序运行失败: {program} , 返回码: {returncode} "):
            return False

    if not args.run_cxlmemsim:
        return True

    for policy_combo in policy_combinations:
        if policy_combo:
            policy_str = ', '.join(policy_combo)
            logger.info(f"使用策略组合 [{policy_str} ] 运行 CXLMemSim: {program} ")
        else:
            logger.info(f"使用默认策略运行 CXLMemSim: {program} ")

        returncode, _ = run_cxl_mem_sim(
            workload_name,
            program,
            workload_config["args"],
            program_dir,
            policy_combo,
            args.pebs_period,
            args.latency,
            args.bandwidth,
            timeout=timeout
        )
        if returncode != 0 and _report_failure(
                args, f"CXLMemSim运行失败: {program} , 返回码: {returncode} "):
            return False

    return True


def _run_selected_workloads(args, policy_combinations, timeout):
    """Run every workload/program that passes the filters; return False if aborted."""
    for workload_name, workload_config in WORKLOADS.items():
        if args.workloads and workload_name not in args.workloads:
            logger.info(f"跳过工作负载: {workload_name} ")
            continue

        logger.info(f"开始处理工作负载: {workload_name} ")

        for program in workload_config["programs"]:
            if args.programs and program not in args.programs:
                logger.info(f"跳过程序: {program} ")
                continue

            if not _run_program(args, workload_name, workload_config, program,
                                policy_combinations, timeout):
                return False

    return True


def _archive_run_log(log_file):
    """Copy the run log into the artifact directory without ever raising."""
    if not log_file:
        return
    destination = os.path.join(ARTIFACT_BASE, "run.log")
    try:
        shutil.copy2(log_file, destination)
    except shutil.SameFileError:
        # The log is already being written to its final location.
        pass
    except OSError as e:
        # A missing or unreadable log must not crash the end of a long run.
        logger.warning(f"无法复制运行日志 {log_file}: {e}")


def run_all_workloads(args):
    """Run every selected workload and collect the logs under ``ARTIFACT_BASE``.

    Args:
        args: Parsed options (an ``argparse.Namespace`` or equivalent). Reads
            ``collect_system_info``, ``run_policy_combinations``, the four
            ``*_policies`` lists, ``workloads``, ``programs``,
            ``run_original``, ``run_cxlmemsim``, ``pebs_period``, ``latency``,
            ``bandwidth``, ``ignore_errors``, ``stop_on_error`` and
            ``log_file``. ``timeout`` is optional and defaults to 3600
            seconds; it bounds each workload run.

    Returns:
        None. When ``stop_on_error`` aborts the run, the completion message
        is skipped but the run log is still copied into the artifact
        directory.
    """
    start_time = time.time()
    logger.info("开始运行所有工作负载")

    ensure_directory(ARTIFACT_BASE)

    # Tolerate namespaces built without a timeout attribute.
    timeout = getattr(args, "timeout", 3600)

    if args.collect_system_info:
        logger.info("收集系统信息")
        run_command(["dmesg"], os.path.join(ARTIFACT_BASE, "dmesg.txt"))
        run_command(["dmidecode"], os.path.join(ARTIFACT_BASE, "dmidecode.txt"))
        run_command(["lspci", "-vvv"], os.path.join(ARTIFACT_BASE, "lspci.txt"))

    # A single None entry means "one simulator run without a -k argument".
    policy_combinations = generate_policy_combinations(args) if args.run_policy_combinations else [None]
    logger.info(f"将使用 {len(policy_combinations)} 种策略组合运行测试")

    completed = _run_selected_workloads(args, policy_combinations, timeout)

    if completed:
        elapsed_time = time.time() - start_time
        hours, remainder = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(remainder, 60)
        logger.info(f"所有工作负载运行完成,总耗时: {int(hours)} 时{int(minutes)} 分{int(seconds)} 秒")

    # Archive the run log even after an abort: that is when it matters most.
    _archive_run_log(args.log_file)
282+
def main(argv=None):
    """Parse the command line, attach the log file and run the workloads.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is what the script
            entry point relies on; passing a list allows programmatic use.
    """
    parser = argparse.ArgumentParser(description="GAPBS和其他工作负载自动化运行脚本")
    parser.add_argument("--workloads", nargs="+", help="要运行的工作负载,默认运行所有")
    parser.add_argument("--programs", nargs="+", help="要运行的程序,默认运行所有")
    parser.add_argument("--run-original", action="store_true", help="运行原始程序")
    parser.add_argument("--run-cxlmemsim", action="store_true", help="使用CXLMemSim运行程序")
    parser.add_argument("--collect-system-info", action="store_true", help="收集系统信息")

    # CXLMemSim parameters
    parser.add_argument("--pebs-period", type=int, default=10, help="PEBS采样周期")
    parser.add_argument("--latency", default="200,250,200,250,200,250", help="CXLMemSim延迟设置")
    parser.add_argument("--bandwidth", default="50,50,50,50,50,50", help="CXLMemSim带宽设置")

    # Policy-combination parameters: one multi-valued option per category,
    # each defaulting to every policy of that category.
    parser.add_argument("--run-policy-combinations", action="store_true", help="运行策略组合测试")
    policy_options = (
        ("--allocation-policies", "分配策略选项", ALLOCATION_POLICIES),
        ("--migration-policies", "迁移策略选项", MIGRATION_POLICIES),
        ("--paging-policies", "分页策略选项", PAGING_POLICIES),
        ("--caching-policies", "缓存策略选项", CACHING_POLICIES),
    )
    for flag, label, choices in policy_options:
        parser.add_argument(flag, nargs="+", choices=choices,
                            help=f"{label}: {', '.join(choices)} ", default=choices)

    # Error handling
    parser.add_argument("--ignore-errors", action="store_true", help="忽略错误继续运行")
    parser.add_argument("--stop-on-error", action="store_true", help="遇到错误时停止运行")
    parser.add_argument("--log-file", default="run.log", help="运行日志文件")
    parser.add_argument("--timeout", type=int, default=3600, help="命令超时时间(秒)")

    args = parser.parse_args(argv)

    # Mirror log records into the run log. The messages contain non-ASCII
    # text, so the encoding is fixed instead of depending on the locale.
    file_handler = logging.FileHandler(args.log_file, encoding="utf-8")
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

    # With neither run mode requested, run both.
    if not args.run_original and not args.run_cxlmemsim:
        args.run_original = True
        args.run_cxlmemsim = True

    run_all_workloads(args)


if __name__ == "__main__":
    main()
0 commit comments