-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
215 lines (170 loc) · 6.64 KB
/
train.py
File metadata and controls
215 lines (170 loc) · 6.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# functions used to train and create the model
import utils
import os
from random import sample
import sys
from math import exp, log
import test
import classify
# One-letter amino acid codes; their position here fixes the feature ordering
acids_list = [
    'A', 'C', 'E', 'D', 'G', 'I', 'H', 'K', 'F', 'M',
    'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y',
]
# Placeholder PSSM row (every acid scored -1) used for window positions
# that fall before the first or after the last row of a PSSM
empty_row = {acid: -1 for acid in acids_list}

# Absolute path of this script and of the directory that contains it
this_script = os.path.abspath(__file__)
parent_directory = os.path.dirname(this_script)

# Training settings
SAMPLE_SIZE = 10    # number of training examples drawn per gradient estimate
STEP_SIZE = 0.001   # multiplier applied to the gradient in each weight update
def main():
    """
    Script entry point.
    Randomly picks 75% of the PSSM files for training, trains the model on
    them, then hands the remaining files to test.main and classify.main.
    """
    # Read in PSSM/RR files
    pssm_list, rr_list, pssm_dir, rr_dir = parse_args()

    train_count = int(0.75 * len(pssm_list))
    pssm_train = sample(pssm_list, train_count)
    # Everything that was not drawn for training is held out
    pssm_test = [entry for entry in pssm_list if entry not in pssm_train]

    train(pssm_train, pssm_dir, rr_dir)
    for stage in (test, classify):
        stage.main(pssm_test, pssm_dir, rr_dir)
def build_feature_matrix(pssm_files, pssm_dir, rr_dir):
    """
    Builds a feature matrix based on PSSM and RR files.
    For each PSSM file, one row with class 1 is added for every (i, j) key of
    the matching RR data, and an equally sized random sample of rows with
    class 0 is drawn from the pairs (j >= i + 5) that are not keys of the RR
    data. Class-0 rows of a file are appended before its class-1 rows.
    :param pssm_files: names of the .pssm files to process
    :param pssm_dir: directory handed to utils.read_pssm
    :param rr_dir: directory handed to utils.read_rr
    :return: List of dictionaries. Keys are feature numbers(0 - 199) and 'class'
    """
    print('Building feature matrix...')
    feature_matrix = []

    def build_row(first, second, class_label):
        # Features of the first residue keep their own numbers; features of
        # the second residue are shifted up by 100.
        feature_row = {'class': class_label}
        feature_row.update(first)
        for feat_num in range(len(second)):
            feature_row[100 + feat_num] = second[feat_num]
        return feature_row

    for pssm_file in pssm_files:
        # For each training file, read in the PSSM matrix and the RR file
        pssm = utils.read_pssm(pssm_file, pssm_dir)
        rr = utils.read_rr(pssm_file.replace('.pssm', '.rr'), rr_dir)
        intermediate_matrix = build_small_matrix(pssm)
        length = len(intermediate_matrix)

        # The RR data may hold a 'sequence' entry next to the (i, j) keys;
        # only the pair keys count as positive examples.
        pair_keys = [key for key in rr.keys() if key != 'sequence']

        # Candidate negatives: pairs at least 5 apart that are not in the RR data
        candidates = [(i, j)
                      for i in range(length)
                      for j in range(i + 5, length)
                      if (i, j) not in rr]
        # Balance the classes: as many negatives as positives, but never more
        # than exist (sample() raises ValueError on an oversized request).
        negatives = sample(candidates, min(len(pair_keys), len(candidates)))

        for i, j in negatives:
            feature_matrix.append(
                build_row(intermediate_matrix[i], intermediate_matrix[j], 0))
        for i, j in pair_keys:
            feature_matrix.append(
                build_row(intermediate_matrix[i], intermediate_matrix[j], 1))
    return feature_matrix
def build_small_matrix(pssm):
    """
    Creates the intermediate matrix with one row per PSSM row.
    Each row flattens a 5-row window (offsets -2 to +2 around the PSSM row)
    into 100 values: 20 per window row, taken in acids_list order. Window
    rows that lie outside the PSSM are read from empty_row instead.
    :return: List of dictionaries. Keys are feature numbers (0 - 99)
    """
    total_rows = len(pssm)
    window = range(-2, 3)
    small_matrix = []
    for center in range(total_rows):
        features = {}
        for slot, offset in enumerate(window):
            neighbor = center + offset
            # Past the top or bottom of the PSSM -> placeholder row
            if 0 <= neighbor < total_rows:
                source_row = pssm[neighbor]
            else:
                source_row = empty_row
            base = slot * 20
            for column, acid in enumerate(acids_list):
                features[base + column] = source_row[acid]
        small_matrix.append(features)
    return small_matrix
def train(pssm_list, pssm_dir, rr_dir):
    """
    Fits the weight vector by repeated gradient steps and writes it out.
    Starting from new_w_vector(), a gradient is computed and applied until
    reached_top() reports that the latest gradient is small enough; the
    resulting weights are passed to utils.write_model.
    """
    # Build feature matrix
    training_rows = build_feature_matrix(pssm_list, pssm_dir, rr_dir)
    weights = new_w_vector()
    print('Training the model...')

    latest_gradient = None  # nothing computed before the first step
    while True:
        if reached_top(weights, latest_gradient):
            break
        latest_gradient = calc_gradient(weights, training_rows)
        weights = update_w(weights, latest_gradient)

    # Save the model to the file
    utils.write_model(weights)
def calc_gradient(w_vector, matrix):
    """
    Estimates the gradient of the conditional log-likelihood from a random
    sample of at most SAMPLE_SIZE training examples.
    :param w_vector: weights; w_vector[0] is w0, w_vector[k] pairs with feature k - 1
    :param matrix: training rows, indexable by feature number and by 'class'
    :return: gradient vector (same length as w_vector)
    """
    gradient_vector = [0] * len(w_vector)
    # Never request more examples than exist; sample() would raise ValueError
    training_data = sample(matrix, min(SAMPLE_SIZE, len(matrix)))
    for training_example in training_data:
        z = calc_sum(w_vector, training_example)
        # P(Y=1|X,w) = exp(z) / (1 + exp(z)): the probability whose
        # log-likelihood is y*z - log(1 + exp(z)), as computed by
        # calc_max_conditional_likelihood. Written so exp() only ever gets a
        # non-positive argument and cannot overflow.
        # NOTE(review): code that consumes the saved model must use this same
        # convention for P(Y=1) - confirm against the classify/test modules.
        if z >= 0:
            p_hat = 1.0 / (1.0 + exp(-z))
        else:
            exp_z = exp(z)
            p_hat = exp_z / (1.0 + exp_z)
        error = training_example['class'] - p_hat
        # Deal with w0
        gradient_vector[0] += error
        # For each feature, calculate the gradient
        for i in range(1, len(w_vector)):
            gradient_vector[i] += training_example[i - 1] * error
    return gradient_vector
def update_w(w_vector, gradient_vector):
    """
    Moves every weight along its gradient component, scaled by STEP_SIZE.
    The passed-in w_vector is modified in place and is also returned.
    :return: w_vector
    """
    for position, weight in enumerate(w_vector):
        w_vector[position] = weight + STEP_SIZE * gradient_vector[position]
    return w_vector
def reached_top(w_vector, gradient_vector, tolerance=0.05):
    """
    Check if we've reached the top of the mountain, i.e. whether every
    component of the gradient has a magnitude of at most `tolerance`.
    :param w_vector: current weights (not inspected)
    :param gradient_vector: latest gradient, or None/empty before the first step
    :param tolerance: largest absolute gradient component still treated as flat
    :return: boolean
    """
    if not gradient_vector:
        # No gradient has been computed yet, so we cannot have converged
        return False
    # Compare magnitudes: a steep negative slope is as far from the top as a
    # steep positive one.
    if any(abs(component) > tolerance for component in gradient_vector):
        return False
    print('Reached the top!')
    return True
def calc_max_conditional_likelihood(w_vector, feature_matrix):
    """
    Computes the conditional log-likelihood of the rows under w_vector:
    SUM over rows of  y * z - log(1 + exp(z)),
    where z = calc_sum(w_vector, row) and y = row['class'].
    :param w_vector: weights, w_vector[0] being w0
    :param feature_matrix: rows indexable by feature number and by 'class'
    :return: the summed log-likelihood as a float
    """
    sum_mcl = 0.0
    for feature in feature_matrix:
        feature_sum = calc_sum(w_vector, feature)
        # log(1 + exp(z)) == max(z, 0) + log(1 + exp(-|z|)); this form never
        # passes a large positive value to exp(), so it cannot overflow.
        log_term = max(feature_sum, 0) + log(1 + exp(-abs(feature_sum)))
        sum_mcl += feature['class'] * feature_sum - log_term
    return sum_mcl
def new_w_vector():
    """
    Creates the starting weight vector: w0 plus 200 feature weights, all 1.
    TODO: Random values for 'base camp'?
    :return: list of length 201
    """
    feature_count = 200
    return [1 for _ in range(feature_count + 1)]
def calc_sum(w_vector, feature_vector):
    """
    Evaluates the linear score w0 + SUM(wi * xi).
    w_vector[0] is w0; every later weight w_vector[k] is multiplied by
    feature_vector[k - 1].
    """
    total = w_vector[0]
    # Accumulate term by term, in weight order
    for feature_index, weight in enumerate(w_vector[1:]):
        total += weight * feature_vector[feature_index]
    return total
# Usage text printed by parse_args() when fewer than two paths are given or
# when reading either directory fails.
# NOTE(review): the text says "linear regression", while the training code in
# this file looks like logistic regression - confirm the intended wording.
err_msg = '''
Please enter two directory names (absolute paths)
containing sequences for linear regression training data
(with double quotes around them if they have spaces).
The directory with PSSM files should come first,
followed by the path to the .rr files.'''
def parse_args():
    """
    Reads the PSSM and RR directory paths from the command line.
    sys.argv[1] is the PSSM directory and sys.argv[2] the RR directory. When
    either is missing, or listing either directory fails, the usage message
    is printed and the process exits with status 1.
    :return: (list of .pssm files, list of .rr files, PSSM directory, RR directory)
    """
    if len(sys.argv) < 3:
        print(err_msg)
        sys.exit(1)

    pssm_dir, rr_dir = sys.argv[1], sys.argv[2]
    try:
        # Get the lists of pssm and rr file names
        pssm = utils.read_directory_contents(pssm_dir, '.pssm')
        rr = utils.read_directory_contents(rr_dir, '.rr')
    except Exception:
        # Given paths are not valid directories. Catching Exception (not a
        # bare except) lets KeyboardInterrupt and SystemExit pass through.
        print(err_msg)
        sys.exit(1)
    # Return list of pssm & rr files, and their parent directories
    return pssm, rr, pssm_dir, rr_dir
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()