-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_train_test.py
More file actions
88 lines (73 loc) · 3.18 KB
/
split_train_test.py
File metadata and controls
88 lines (73 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
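#!/usr/bin/env python3
"""Preprocess the training data: split the cleaned summary table into
5-fold train/test CSV files, per study and per leave-one-study-out group.
author: @rayezh
"""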
from xml.dom import ValidationErr
import pandas as pd
import os, pickle
from sklearn.model_selection import train_test_split, KFold
# Load the cleaned summary table and keep only the rows in which every one of
# the score columns listed below is present (non-null).
df_all = pd.read_csv('./dataset/cleaned_summary.csv').dropna(
    subset=[
        "css",
        "synergy_zip",
        "synergy_bliss",
        "synergy_loewe",
        "synergy_hsa",
        "S",
    ]
)
# split train and test by drug_col, drug_row and cell_line_name
def split_train_test(df, all_study):
    """Write 5-fold train/test splits for every study and study group.

    Two families of subsets are produced under ``./dataset_split``:

    * "1v1": one directory per study in ``all_study``, containing only the
      rows of that study.
    * "3v1": one directory per left-out study, containing the rows of all
      the *other* studies; the directory is named after the sorted,
      underscore-joined names of the studies that were kept.

    Each directory receives ``all.csv`` (the subset itself) and
    ``fold_0`` ... ``fold_4`` sub-directories with ``train.csv`` and
    ``test.csv``. Folds are drawn over unique drug-pair/cell-line keys, so
    every experiment sharing a key lands on the same side of a split.

    Args:
        df: DataFrame with at least the columns ``study_name``,
            ``drug_col``, ``drug_row`` and ``cell_line_name``.
        all_study: list of study names to look up in ``df['study_name']``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    out_root = './dataset_split'
    os.makedirs(out_root, exist_ok=True)

    # 1v1: every study split on its own.
    for study_name in all_study:
        print(study_name)
        subset = df.loc[df['study_name'] == study_name, :]
        _write_cv_folds(subset, out_root + '/' + study_name)

    # 3v1: leave one study out and split the pooled remaining studies.
    for study_name in all_study:
        train_study = [s for s in all_study if s != study_name]
        print(train_study)
        subset = df.loc[df['study_name'].isin(train_study), :]
        _write_cv_folds(subset, out_root + '/' + '_'.join(sorted(train_study)))


def _write_cv_folds(df_subset, out_dir, n_splits=5, random_state=42):
    """Dump ``df_subset`` and its K-fold train/test splits into ``out_dir``.

    Args:
        df_subset: rows to split; must have ``drug_col``, ``drug_row`` and
            ``cell_line_name`` columns.
        out_dir: directory to create (if needed) and write into.
        n_splits: number of folds passed to ``KFold``.
        random_state: seed passed to ``KFold`` so the shuffle is repeatable.
    """
    os.makedirs(out_dir, exist_ok=True)
    # all.csv is written before the helper 'comb' column is added, so it
    # holds exactly the columns of the input frame.
    df_subset.to_csv(out_dir + '/all.csv', index=False)

    # Work on an explicit copy: the caller passes a slice of the full frame,
    # and adding a column to a slice is chained assignment
    # (SettingWithCopyWarning).
    df_subset = df_subset.copy()
    # The two drug names are sorted so (A, B) and (B, A) yield the same key.
    df_subset['comb'] = [
        '_'.join(sorted([drug_a, drug_b]) + [cell_line])
        for drug_a, drug_b, cell_line in zip(
            df_subset['drug_col'],
            df_subset['drug_row'],
            df_subset['cell_line_name'],
        )
    ]
    # Sorted so that the seeded shuffle maps to the same keys on every run.
    all_comb = sorted(set(df_subset['comb']))
    print("all experiments:", df_subset.shape[0])
    print("all combination treatment-cell line combinations:", len(all_comb))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold, (train_idx, test_idx) in enumerate(kf.split(all_comb)):
        fold_dir = out_dir + '/fold_' + str(fold)
        os.makedirs(fold_dir, exist_ok=True)
        train_keys = [all_comb[j] for j in train_idx]
        test_keys = [all_comb[j] for j in test_idx]
        # Select whole groups of experiments by key; the 'comb' column is
        # kept in the written train/test files.
        train_df = df_subset[df_subset['comb'].isin(train_keys)]
        test_df = df_subset[df_subset['comb'].isin(test_keys)]
        train_df.to_csv(fold_dir + '/train.csv', index=False)
        test_df.to_csv(fold_dir + '/test.csv', index=False)
# inter-study train and split: run the splitter over the four studies below
all_study = [
    "ONEIL",
    "ALMANAC",
    "FORCINA",
    "Mathews",
]
split_train_test(df=df_all, all_study=all_study)