# Project4/train.py (compbiospring2019/Project4)
# functions used to train and create the model
import utils
import os
from random import sample
import sys
from math import exp, log
import test
import classify

# One-letter amino-acid codes, in the column order used when building features
acids_list = list('ACEDGIHKFMLNQPSRTWVY')
# Stand-in PSSM row (-1 for every acid) for window positions outside the PSSM
empty_row = dict.fromkeys(acids_list, -1)

# Locate the directory that contains this script
this_script = os.path.abspath(__file__)
parent_directory = os.path.dirname(this_script)

# Training settings: examples per gradient estimate, and gradient-ascent step
SAMPLE_SIZE = 10
STEP_SIZE = 0.001


def main():
    """
    Run the whole pipeline: split the PSSM files 75/25 at random, train on
    the larger part, then hand the held-out part to test and classify.
    """
    # The command line supplies the file lists and the directories they live in
    pssm_list, rr_list, pssm_dir, rr_dir = parse_args()

    # Randomly choose 75% of the PSSM files for training
    train_count = int(0.75 * len(pssm_list))
    pssm_train = sample(pssm_list, train_count)

    # Everything that was not picked for training is held out
    pssm_test = []
    for pssm in pssm_list:
        if pssm in pssm_train:
            continue
        pssm_test.append(pssm)

    train(pssm_train, pssm_dir, rr_dir)
    for stage in (test, classify):
        stage.main(pssm_test, pssm_dir, rr_dir)


def _build_feature_row(small_matrix, i, j, class_label):
    """
    Build one feature-matrix row for the pair (i, j).
    :param small_matrix: output of build_small_matrix (one dict per position)
    :param i: index of the first position; its features keep their own keys
    :param j: index of the second position; its features are shifted by 100
    :param class_label: value stored under 'class'
    :return: dictionary with the 'class' key plus both positions' features
    """
    feature_row = {'class': class_label}
    feature_row.update(small_matrix[i])
    second = small_matrix[j]
    for feat_num in range(len(second)):
        feature_row[100 + feat_num] = second[feat_num]
    return feature_row


def build_feature_matrix(pssm_files, pssm_dir, rr_dir):
    """
    Builds a feature matrix based on PSSM and RR files.

    For every PSSM file the RR file with the same name ('.pssm' replaced by
    '.rr') is read. Each (i, j) key of the RR data becomes a class-1 row. The
    class-0 rows are a random sample, of the same size when enough candidates
    exist, of pairs with j >= i + 5 that are not keys of the RR data.

    :param pssm_files: PSSM file names to read
    :param pssm_dir: directory passed to utils.read_pssm
    :param rr_dir: directory passed to utils.read_rr
    :return: List of dictionaries. Keys are feature numbers (0 - 199) and 'class'
    """
    print('Building feature matrix...')
    feature_matrix = []

    for pssm_file in pssm_files:
        # For each training file, read in the PSSM matrix and the RR file
        pssm = utils.read_pssm(pssm_file, pssm_dir)
        rr = utils.read_rr(pssm_file.replace('.pssm', '.rr'), rr_dir)
        intermediate_matrix = build_small_matrix(pssm)
        length = len(intermediate_matrix)

        # The RR data may hold a non-pair 'sequence' entry; leave it out so it
        # is neither treated as a pair nor counted when balancing the classes
        contact_pairs = [key for key in rr if key != 'sequence']

        # Candidate negatives: pairs at least 5 apart that are not in the RR data
        candidates = [(i, j)
                      for i in range(length)
                      for j in range(i + 5, length)
                      if (i, j) not in rr]

        # Balance the classes; never ask sample() for more pairs than exist,
        # which would raise ValueError
        negative_count = min(len(contact_pairs), len(candidates))
        non_contact_pairs = sample(candidates, negative_count)

        # Class-0 rows first, then class-1 rows, for each file
        for i, j in non_contact_pairs:
            feature_matrix.append(_build_feature_row(intermediate_matrix, i, j, 0))
        for i, j in contact_pairs:
            feature_matrix.append(_build_feature_row(intermediate_matrix, i, j, 1))
    return feature_matrix


def build_small_matrix(pssm):
    """
    Builds the 'small' intermediate matrix: one entry per PSSM row, holding
    the 100 values (5 rows x 20 acids) of the size-5 window centred on it.
    Window rows that fall outside the PSSM are filled from empty_row.
    :return: List of dictionaries. Keys are feature numbers (0 - 99)
    """
    total_rows = len(pssm)
    window_features = []
    for center in range(total_rows):
        row_features = {}
        # slot 0..4 corresponds to the rows center-2 .. center+2
        for slot, neighbour in enumerate(range(center - 2, center + 3)):
            inside = 0 <= neighbour < total_rows
            scores = pssm[neighbour] if inside else empty_row
            base = slot * 20
            for position, acid in enumerate(acids_list):
                row_features[base + position] = scores[acid]
        window_features.append(row_features)
    return window_features


def train(pssm_list, pssm_dir, rr_dir):
    """
    Fit the weight vector by gradient ascent and write it out with
    utils.write_model.
    """
    # Turn the training files into rows of features plus a class label
    feature_matrix = build_feature_matrix(pssm_list, pssm_dir, rr_dir)

    weights = new_w_vector()
    gradient = None  # nothing computed yet

    print('Training the model...')
    while True:
        if reached_top(weights, gradient):
            break
        gradient = calc_gradient(weights, feature_matrix)
        weights = update_w(weights, gradient)

    # Persist the trained weights
    utils.write_model(weights)


def calc_gradient(w_vector, matrix):
    """
    Calculates the gradient of the conditional log-likelihood from a random
    sample of (at most SAMPLE_SIZE) training examples.

    Uses P(Y=1|X,w) = exp(z) / (1 + exp(z)) with z = w0 + SUM(wi * xi), the
    convention for which the log-likelihood is Y * z - ln(1 + exp(z)) and its
    gradient is SUM(xi * (Y - P(Y=1|X,w))).

    :param w_vector: weights; w_vector[0] is w0, w_vector[i] goes with feature i - 1
    :param matrix: training rows, indexable by feature number and by 'class'
    :return: gradient vector, same length as w_vector
    """
    gradient_vector = [0] * len(w_vector)
    # Cap the sample size so a small matrix does not make sample() raise
    training_data = sample(matrix, min(SAMPLE_SIZE, len(matrix)))

    for training_example in training_data:
        weighted_sum = calc_sum(w_vector, training_example)

        # Calculate P(Y=1|X,w); each branch only calls exp() on a
        # non-positive argument, so it cannot raise OverflowError
        if weighted_sum >= 0:
            p_hat = 1.0 / (1.0 + exp(-weighted_sum))
        else:
            growth = exp(weighted_sum)
            p_hat = growth / (1.0 + growth)

        error = training_example['class'] - p_hat

        # Deal with w0
        gradient_vector[0] += error

        # For each feature, calculate the gradient
        for i in range(1, len(w_vector)):
            gradient_vector[i] += training_example[i - 1] * error

    return gradient_vector


def update_w(w_vector, gradient_vector):
    """
    Take one gradient-ascent step of size STEP_SIZE, changing w_vector in place.
    :return: the same w_vector list, after the step
    """
    for position, weight in enumerate(w_vector):
        w_vector[position] = weight + STEP_SIZE * gradient_vector[position]
    return w_vector


def reached_top(w_vector, gradient_vector, tolerance=0.05):
    """
    Check if we've reached the top of the mountain, i.e. every gradient
    component is within `tolerance` of zero in absolute value.

    :param w_vector: current weights (not used by the check)
    :param gradient_vector: latest gradient, or None/empty before the first step
    :param tolerance: largest magnitude a component may have at the top
    :return: boolean
    """
    if not gradient_vector:
        # No gradient has been computed yet, so we cannot be at the top
        return False
    # Compare magnitudes: a steep negative slope is as far from the top as a
    # steep positive one
    if any(abs(component) > tolerance for component in gradient_vector):
        return False
    print('Reached the top!')
    return True


def calc_max_conditional_likelihood(w_vector, feature_matrix):
    """
    Calculates the conditional log-likelihood of the rows under w_vector:
    SUM over rows of  Y * z - ln(1 + exp(z)),  with z = w0 + SUM(wi * xi).

    :param w_vector: weights; w_vector[0] is w0
    :param feature_matrix: rows indexable by feature number and by 'class'
    :return: float (0.0 when feature_matrix is empty)
    """
    sum_mcl = 0.0

    for feature in feature_matrix:
        feature_sum = calc_sum(w_vector, feature)
        # ln(1 + exp(z)) written as max(z, 0) + ln(1 + exp(-|z|)) so exp() is
        # only called on a non-positive argument and cannot overflow
        log_normalizer = max(feature_sum, 0) + log(1 + exp(-abs(feature_sum)))
        sum_mcl += feature['class'] * feature_sum - log_normalizer

    return sum_mcl


def new_w_vector():
    """
    Create the starting weight vector, with every weight set to 1.
    TODO: Random values for 'base camp'?
    :return: list of length 201
    """
    return [1 for _ in range(201)]


def calc_sum(w_vector, feature_vector):
    """
    Compute w0 + SUM(wi * xi), where w0 is w_vector[0] and weight
    w_vector[k + 1] multiplies feature_vector[k].
    """
    total = w_vector[0]
    for offset, weight in enumerate(w_vector[1:]):
        total += weight * feature_vector[offset]
    return total


err_msg = '''
Please enter two directory names (absolute paths)
containing sequences for linear regression training data
(with double quotes around them if they have spaces).
The directory with PSSM files should come first,
followed by the path to the .rr files.'''


def parse_args():
    """
    Read the PSSM and RR directory paths from the command line.

    Prints the usage message and exits with status 1 when fewer than two
    paths are given or when reading either directory fails.

    :return: tuple of (result of reading the PSSM directory, result of
             reading the RR directory, PSSM directory path, RR directory path)
    """
    if len(sys.argv) < 3:
        print(err_msg)
        sys.exit(1)

    pssm_dir, rr_dir = sys.argv[1], sys.argv[2]
    try:
        # Get the lists of pssm and rr file names
        pssm = utils.read_directory_contents(pssm_dir, '.pssm')
        rr = utils.read_directory_contents(rr_dir, '.rr')
    except Exception:
        # Given paths are not valid directories. Catching Exception rather
        # than using a bare except lets Ctrl-C and SystemExit propagate.
        print(err_msg)
        sys.exit(1)

    # Return list of pssm & rr files, and their parent directories
    return pssm, rr, pssm_dir, rr_dir


# Run the pipeline only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()