-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathattack.py
More file actions
132 lines (113 loc) · 5.69 KB
/
Copy pathattack.py
File metadata and controls
132 lines (113 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import json
from utils import utils as my_utilities
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import argparse
# Default input locations. Each one is also the default value of the matching
# command-line option declared in the ``__main__`` section of this file.

# JSON file that attack() loads as the dictionary of target tasks.
TARGET_TASK_PATH = 'data/target_37.json'
# CSV file whose ``instruction`` column supplies the injected questions.
INJECTED_TASK_PATH = 'data/injected_questions_with_policy_185.csv'
# CSV file of attack templates; get_attack() reads its ``topic`` and ``example`` columns.
ATTACK_INSTRUCTION_PATH = 'data/attack_20.csv'
# JSON file read by get_gcg_attack(); maps an injected task to the string appended to it.
GCG_STRING_PATH = 'data/gcg_string.json'
# Cache for the parsed GCG_STRING_PATH contents; stays None until get_gcg_attack() first runs.
_gcg_string_dict = None
def get_gcg_attack(injected_task):
    """Return ``injected_task`` followed by the GCG string stored for it.

    The mapping is read from the JSON file at ``GCG_STRING_PATH`` on the first
    call and kept in the module-level ``_gcg_string_dict`` for later calls.

    Args:
        injected_task: Injected instruction text; it is also the lookup key
            into the loaded mapping.

    Returns:
        ``injected_task`` concatenated with the value stored under it.

    Raises:
        KeyError: If the mapping has no entry for ``injected_task``.
    """
    global _gcg_string_dict
    if _gcg_string_dict is None:
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's locale encoding.
        with open(GCG_STRING_PATH, 'r', encoding='utf-8') as f:
            _gcg_string_dict = json.load(f)
    return injected_task + _gcg_string_dict[injected_task]
def get_attack(method, injected_task, attack_df):
    """Build the attack text for ``method`` with ``injected_task`` filled in.

    Args:
        method: Value matched against the ``topic`` column of ``attack_df``.
        injected_task: Text substituted for every ``{instruction}``
            placeholder in the selected template.
        attack_df: DataFrame with ``topic`` and ``example`` columns.

    Returns:
        The ``example`` of the first row whose ``topic`` equals ``method``,
        with the placeholder replaced.

    Raises:
        IndexError: If no row of ``attack_df`` has ``topic`` equal to
            ``method``.
    """
    templates = attack_df[attack_df['topic'] == method]['example'].values
    if len(templates) == 0:
        # Same exception type the bare ``.values[0]`` lookup produced, but
        # with a message that names the missing method.
        raise IndexError(f"no attack template with topic {method!r}")
    return templates[0].replace("{instruction}", injected_task)
def generate_instruction(target_task_name, attack_method, injected_task, target_tasks, attack_df):
    """Assemble the (system prompt, user prompt) pair for one attack attempt.

    The system prompt is the target task's ``instruction``. The user prompt
    is the task's query ``'1'`` followed by the attack text: the GCG variant
    when ``attack_method`` is ``'gcg'``, otherwise the template-based attack
    looked up in ``attack_df``.
    """
    task = target_tasks[target_task_name]
    system_prompt = str(task['instruction'])
    query_text = str(task['query']['1'])
    if attack_method == 'gcg':
        payload = get_gcg_attack(injected_task)
    else:
        payload = get_attack(attack_method, injected_task, attack_df)
    return system_prompt, query_text + payload
def fill_single_cell(idx, col, attack_method, llm, target_tasks, attack_df):
    """Ask the LLM for the answer belonging to one result-table cell.

    ``idx`` is the target task name (row label) and ``col`` is the injected
    question (column label). Returns ``(idx, col, answer)`` so the caller can
    store the answer in the matching cell.
    """
    prompts = generate_instruction(idx, attack_method, col, target_tasks, attack_df)
    # Pair each prompt with its chat role: system first, then user.
    messages = [
        {'role': role, 'content': content}
        for role, content in zip(('system', 'user'), prompts)
    ]
    return idx, col, my_utilities.llm_call_message(llm, messages)
# attack() rewrites the results CSV each time the count of successfully
# completed cells reaches a multiple of this value.
SAVE_INTERVAL = 10 # save every N completed tasks
def attack(attack_method, llm, num_workers, target_task_path, save_dir,
           injected_task_path=INJECTED_TASK_PATH, attack_instruction_path=ATTACK_INSTRUCTION_PATH):
    """Run one attack method over every (target task, injected question) pair.

    Results live in a CSV at ``{save_dir}/{llm}/answer/{attack_method}.csv``
    with one row per target task and one column per injected question. If the
    file already exists it is loaded and only the empty (NaN) cells are
    processed, so an interrupted run can be resumed.

    Args:
        attack_method: Attack name; ``'gcg'`` or a ``topic`` value from the
            attack-template CSV.
        llm: Model identifier passed through to ``fill_single_cell`` and used
            in the result path.
        num_workers: Maximum number of worker threads.
        target_task_path: Path to the JSON file of target tasks.
        save_dir: Root directory for result files.
        injected_task_path: CSV with an ``instruction`` column.
        attack_instruction_path: CSV of attack templates.

    Returns:
        None. Output is written to the result CSV.
    """
    print('Loading target tasks...')
    # JSON is decoded as UTF-8 explicitly rather than via the locale default.
    with open(target_task_path, 'r', encoding='utf-8') as f:
        target_tasks = json.load(f)
    print('Loading attack templates...')
    attack_df = pd.read_csv(attack_instruction_path)
    print('Loading injected questions...')
    injected_df = pd.read_csv(injected_task_path)
    injected_instr = injected_df['instruction'].tolist()

    result_file_path = f'{save_dir}/{llm}/answer/{attack_method}.csv'
    if os.path.exists(result_file_path):
        print(f'Resume from last time: {result_file_path}')
        result_df = pd.read_csv(result_file_path, index_col=0)
    else:
        print(f'Create new file: {result_file_path}')
        # The per-model output directory may not exist yet on a first run;
        # without this the initial to_csv() fails.
        os.makedirs(os.path.dirname(result_file_path), exist_ok=True)
        result_df = pd.DataFrame(columns=injected_instr, index=target_tasks.keys())
        result_df.to_csv(result_file_path, index=True)

    # Cast every column to object so string answers can be assigned into
    # columns that read_csv may have typed as float (all-NaN).
    for col in result_df.columns:
        result_df[col] = result_df[col].astype(object)

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        print(f'Processing target tasks...{result_df.index}')
        # Submit only the cells that are still empty.
        for idx in result_df.index:
            for col in result_df.columns:
                if pd.isna(result_df.at[idx, col]):
                    futures.append(executor.submit(
                        fill_single_cell, idx, col, attack_method, llm, target_tasks, attack_df))

        completed_count = 0
        for future in tqdm(as_completed(futures), total=len(futures)):
            # A failed cell is reported and left empty so a later run can
            # retry it; it does not abort the remaining work.
            try:
                idx, col, answer = future.result()
                result_df.at[idx, col] = answer
                completed_count += 1
                if completed_count % SAVE_INTERVAL == 0:
                    result_df.to_csv(result_file_path, index=True)
            except KeyError as e:
                print(f"[KeyError] Raised in future: {future}")
                print(f"Exception: {e}")
            except Exception as e:
                print(f"[Unhandled Exception] in future: {future}")
                print(f"Exception: {e}")

    result_df.to_csv(result_file_path, index=True)  # final save
if __name__ == '__main__':
    # Command-line entry point: parse options, run the requested attack
    # method (or every method in my_utilities.attack_list when 'all' is
    # given), then print a cost summary for models listed in GPT_PRICING.
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--attack_method', type=str, default='combined_attack')
    parser.add_argument('-m', '--llm', type=str, default='llama3')
    parser.add_argument('-w', '--num_workers', type=int, default=10)
    parser.add_argument('-s', '--save_dir', type=str, default='result')
    parser.add_argument('--target_task_path', type=str, default=TARGET_TASK_PATH)
    parser.add_argument('--injected_task_path', type=str, default=INJECTED_TASK_PATH)
    parser.add_argument('--attack_all_path', type=str, default=ATTACK_INSTRUCTION_PATH)
    parser.add_argument('--gcg_string_path', type=str, default=GCG_STRING_PATH)
    args = parser.parse_args()

    llm = args.llm
    attack_method = args.attack_method
    num_workers = args.num_workers
    target_task_path = args.target_task_path
    save_dir = args.save_dir
    # Rebind the module-level path that get_gcg_attack() reads.
    GCG_STRING_PATH = args.gcg_string_path
    print(f'parameters: {args}')

    # 'all' expands to the full method list; anything else is a single method.
    if attack_method == 'all':
        attack_methods = my_utilities.attack_list
    else:
        attack_methods = [attack_method]

    for attack_method in attack_methods:
        print(f'Attack method: {attack_method}, LLM: {llm}, target task: {target_task_path}')
        attack(attack_method, llm, num_workers, target_task_path, save_dir,
               args.injected_task_path, args.attack_all_path)

    if llm in my_utilities.GPT_PRICING:
        my_utilities.print_cost_summary(llm)