-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.py
More file actions
169 lines (147 loc) · 5.7 KB
/
test.py
File metadata and controls
169 lines (147 loc) · 5.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# functions used to create the model
import utils
import os
from random import sample
import sys
from math import exp
empty_row = [-1] * 20
acids_list = ['A', 'C', 'E', 'D', 'G', 'I', 'H', 'K', 'F', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y']
# Get the parent directory of this code
this_script = os.path.abspath(__file__)
parent_directory = os.path.dirname(this_script)
# Set the directory for testing output
rr_output_directory = parent_directory + "/testing-rr-output"
def main(pssm_files, pssm_dir, rr_dir):
test(pssm_files, pssm_dir, rr_dir)
accuracy(rr_dir)
def test(pssm_files, pssm_dir, rr_dir):
"""
Test the model. Generate contact probabilities and save them in .rr format sorted in descending order.
"""
model = utils.read_model()
rr_rows = []
# create directory for .rr files created by testing
try:
os.mkdir(rr_output_directory)
except FileExistsError:
print("Removing old .rr files...")
old_rr_files = utils.read_directory_contents(rr_output_directory, '.rr')
for old_rr_file in old_rr_files:
os.remove(os.path.join(rr_output_directory, old_rr_file))
# test
print("Testing...")
for pssm_file in pssm_files:
# get the feature values of every pair
test_matrix = build_test_matrix(pssm_file, pssm_dir)
# get the sequence associated with this pssm
sequence = utils.read_rr(pssm_file.replace('.pssm', '.rr'), rr_dir)['sequence']
for pair in range(len(test_matrix)):
contact_probability = calculate_contact_probability(model, test_matrix[pair])
# create row
i, j = list(test_matrix[pair].keys())[0][0], list(test_matrix[pair].keys())[0][1]
rr_row = [i, j, 0, 8, contact_probability]
# rr_rows.append(" ".join([str(x) for x in rr_row]))
rr_rows.append(rr_row)
# sort rows
rr_rows.sort(key = lambda row: row[4], reverse = True)
# write to file
file_name = os.path.join(rr_output_directory, pssm_file.replace('.pssm', '.rr'))
with open(file_name, 'w') as file:
file.write(sequence + "\n")
# stringify all rows and put in file
for row in rr_rows:
file.write(" ".join([str(x) for x in row]) + "\n")
def accuracy(rr_dir):
"""
Determine the average accuracy of the L/10, L/5, and L/2 most probable contact predictions.
"""
# metrics
l10_acc, l5_acc, l2_acc = 0.0, 0.0, 0.0
# get the .rr files created in testing (predictions)
rr_outputs = utils.read_directory_contents(rr_output_directory, '.rr')
# determine the accuracy of each individual .rr file
num_files = 0
for rr_output in rr_outputs:
num_files += 1
# create a list of the predictions
predictions = utils.read_rr(rr_output, rr_output_directory)
predictions_list = listify_rr(predictions)
predictions_list.sort(key = lambda row: row[2], reverse = True)
# create a list of the actual contact pairs
contact_pairs = utils.read_rr(rr_output, rr_dir)
contact_pairs_list = listify_rr(contact_pairs)
# metrics
l10_denom, l10_num, l5_denom, l5_num, l2_denom, l2_num = 0, 0, 0, 0, 0, 0
L = len(contact_pairs['sequence'])
for i in range(int(L / 2)):
# make prediction
predicted_contact = predictions_list[i][2] > 0.5
actual_contact = (predictions_list[i][0], predictions_list[i][1]) in contact_pairs
# update metrics
if i <= L / 10 and actual_contact:
l10_denom += 1
if predicted_contact:
l10_num += 1
if i <= L / 5 and actual_contact:
l5_denom += 1
if predicted_contact:
l5_num += 1
if actual_contact:
l2_denom += 1
if predicted_contact:
l2_num += 1
# aggregate
if l10_denom != 0:
l10_acc += l10_num / l10_denom
if l5_denom != 0:
l5_acc += l5_num / l5_denom
if l2_denom != 0:
l2_acc += l2_num / l2_denom
# average
l10_acc /= num_files
l5_acc /= num_files
l2_acc /= num_files
print("Accuracy")
print("--------")
print("L10: " + str(l10_acc))
print("L5: " + str(l5_acc))
print("L2: " + str(l2_acc))
def listify_rr(rr):
pair_keys = [key for key in rr.keys() if key != 'sequence']
return [[pair[0], pair[1], rr[pair]] for pair in pair_keys]
def calculate_contact_probability(model, pair):
"""
:return: the probability that the given pair is in contact
"""
feature_values = list(pair.values())[0]
n = exp(model[0] + sum([model[i + 1] * feature_values[i] for i in range(200)]))
return n / (1 + n)
def build_test_matrix(pssm_file, pssm_dir):
"""
:return: a list of dictionaries with key = (i, j) and value = a list of feature values
"""
pssm = utils.read_pssm(pssm_file, pssm_dir)
test_matrix = []
for i in range(len(pssm)):
for j in range(i + 5, len(pssm)):
feature = {}
row = get_five(pssm, i)
row.extend(get_five(pssm, j))
feature[(i, j)] = row
test_matrix.append(feature)
return test_matrix
def get_five(pssm, i):
"""
:return: a list of the 100 feature values in the sliding window around i
"""
values = []
for row_offset in range(-2, 3):
if i + row_offset < 0:
values.extend(empty_row)
elif i + row_offset >= len(pssm):
values.extend(empty_row)
else:
values.extend([pssm[i + row_offset][k] for k in acids_list])
return values
if __name__ == "__main()__":
main()