Deep-Learning-gene-expression/RandomGenes.py at master · macsbio/Deep-Learning-gene-expression · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import pickle
import numpy as np
import scipy.io as sio
import CompoundLists
import tensorflow as tf
from PredictEncoding import leave_one_out_evaluation
from SmallParser import SmallDatasetParser, get_doubling_xy, get_x, read_data, read_nested_orths
from Processing import correct_slopes, decorrect_slopes, normalize_features, normalize_with_norms, \
    denormalize_with_norms, normalize_total, split_train_test

'''
Warning: this is VERY CPU/disk intensive and may slow down / crash your computer.

The final output of the program will contain accuracy info, and separate files will be created for the random gene sets (x and y)
and the LOO prediction results. The genes will be printed before starting the LOO procedure.

A single execution will likely take several hours.
'''

def main():
    tf.logging.set_verbosity(tf.logging.ERROR)

    compound_list = CompoundLists.UNGENERAL_45  # Dan: I changed this line

    # Change these lines to change prediction direction.
    # NOTE: currently only supports rat vitro -> vitro and rat vitro -> rat vivo
    # i.e. x_type should always be rat_vitro
    x_type = "rat_vitro"
    y_type = "human_vitro"  # Dan: change here if necessary!

    x_timepoints = 3
    y_timepoints = 3
    if y_type == 'rat_vivo':
        y_timepoints = 4

    x_satisfied = False
    y_satisfied = False

    # Only use these lines if you already have the data files. Selection process will then be disabled.
    #x_satisfied = True
    #og_X, data_compounds, gene_list_x, X_gene_variance = pickle.load(open('Data/RatInVitro/20/data_X20_1.p', 'rb'))
    #y_satisfied = True
    #og_Y, data_compounds, gene_list_y, Y_gene_variance = pickle.load(open('Data/HumanInVitro/20/data_20_1_human.p', 'rb'))


    # Variances from ST gene list (steatosis, 50 genes)
    target_x_var = 0.001386  # rat in vitro
    target_y_var = 0.001325  # rat in vivo
    if y_type == 'human_vitro':
        target_y_var = 0.000986

    deviation = 0.03  # variance should be at most 3% more than the target and at least 3% less than the target
    x_var_low = target_x_var - target_x_var * deviation
    x_var_up = target_x_var + target_x_var * deviation
    y_var_low = target_y_var - target_y_var * deviation
    y_var_up = target_y_var + target_y_var * deviation


    print("X var range: {} - {}".format(x_var_low, x_var_up))
    print("Y var range: {} - {}".format(y_var_low, y_var_up))


    nested = True  #if true, input file needs to be selected below (scroll down)
    orthologs = False  #change here as desired

    # names of output files need to be manually adjusted (as desired)
    # to create the 'core' of a nest (e.g. 20), nested must be set to False!

    if nested == False:
        for k in range(80,81):  # desired number of genes
            numb_genes = k
            for j in range(99,100):  # desired number (i.e. names) of sets
                x_satisfied = False  # set to True if you want to select only one domain (not recommended)
                y_satisfied = False
                file1 = "data_X%d"%(numb_genes)+"_%d"%(j) + ".p"
                file2 = "data_%d"%(numb_genes)+"_%d"%(j) + "_human.p"  # change to desired domain here!

                while not x_satisfied or not y_satisfied:
                    og_X, og_Y, data_compounds, genes_x, genes_y = read_data(compound_list.copy(), x_type=x_type, y_type=y_type, \
                    gene_list='random', dataset="big", numb_genes=numb_genes, domain="both", orthologs=orthologs)
                    if not x_satisfied:
                        X, _ = normalize_total(og_X)
                        X_gene_means = np.zeros(numb_genes)
                        X_gene_variance = np.zeros(numb_genes)
                        for i in range(numb_genes):
                            X_gene_means[i] = np.mean(X[:, i * x_timepoints : i * x_timepoints + x_timepoints])
                            X_gene_variance[i] = np.var(X[:, i * x_timepoints : i * x_timepoints + x_timepoints])

                        if X_gene_variance.mean() >= x_var_low and X_gene_variance.mean() <= x_var_up:
                            print("X satisfied!")
                            x_satisfied = True
                            gene_list_x = genes_x
                            if not orthologs:
                                with open(file1, 'wb') as f:
                                    pickle.dump([og_X, data_compounds, gene_list_x, X_gene_variance], f)
                                    print("Dumped file ", file1)
                                    print("X genes: ", gene_list_x)
                        elif orthologs:  # occurs only if we select unnested orthologs (e.g. 20)
                            continue

                    if not y_satisfied:
                        Y, _ = normalize_total(og_Y)
                        Y_gene_means = np.zeros(numb_genes)
                        Y_gene_variance = np.zeros(numb_genes)
                        for i in range(numb_genes):
                            Y_gene_means[i] = np.mean(Y[:, i * y_timepoints : i * y_timepoints + y_timepoints])
                            Y_gene_variance[i] = np.var(Y[:, i * y_timepoints : i * y_timepoints + y_timepoints])

                        print("Y Mean:", Y_gene_means.mean())
                        print("Y Variance:", Y_gene_variance.mean())

                        if Y_gene_variance.mean() >= y_var_low and Y_gene_variance.mean() <= y_var_up:
                            print("Y satisfied!")
                            y_satisfied = True
                            gene_list_y = genes_y
                            if not orthologs:
                                with open(file2, 'wb') as f:
                                    pickle.dump([og_Y, data_compounds, gene_list_y, Y_gene_variance], f)
                                    print("Dumped file ", file1)
                                    print("Y genes: ", gene_list_y)
                        elif orthologs:
                            x_satisfied = False  # starting all over again

                    if x_satisfied and y_satisfied and orthologs:
                        with open(file1, 'wb') as f:
                            pickle.dump([og_X, data_compounds, gene_list_x, X_gene_variance], f)
                        with open(file2, 'wb') as f:
                            pickle.dump([og_Y, data_compounds, gene_list_y, Y_gene_variance], f)

    else:  # nested case
        numb_genes = 15  # number of genes to add
        numb_genes_core = 35  # size of set to build upon
        numb_genes_out = numb_genes + numb_genes_core
        for j in range(1,2):  #  desired number (and names) of output sets
            y_satisfied = False  # set True if only one domain is desired (not recommended)
            x_satisfied = False
            file_out1 = "data_X%d"%(numb_genes_out) + "_%d"%(j) + "_nest.p"
            file_out2 = "data_%d"%(numb_genes_out) + "_%d"%(j) + "_human_nest.p"  # Dan: adjust here if necessary
            #file_in1 = "data_X%d"%(numb_genes_core) + "_%d"%(j) + "_nest.p"
            #file_in2 = "data_%d"%(numb_genes_core) + "_%d"%(j) + "_human_nest.p"
            file_in1 = "Data/RatInVitro/%d"%(numb_genes_core) + "/Nested/Random%d"%(numb_genes_core) + \
            "/data_X%d"%(numb_genes_core) + "_%d"%(j) + "_nest.p"
            file_in2 = "Data/HumanInVitro/%d"%(numb_genes_core) + "/Nested/Random%d"%(numb_genes_core) + \
            "/data_%d"%(numb_genes_core) + "_%d"%(j) + "_human_nest.p"  # change name here as desired

            X_core, _, gene_list_x_core, variance_x_core = pickle.load(open(file_in1, "rb"))
            Y_core, _, gene_list_y_core, variance_y_core = pickle.load(open(file_in2, "rb"))

            while not x_satisfied or not y_satisfied:
                og_X, og_Y, data_compounds, gene_list_x, gene_list_y = read_data(compound_list.copy(), y_type=y_type, \
                gene_list='random', domain="both", numb_genes=numb_genes, orthologs=orthologs, genes_provided=gene_list_x_core)
                if not x_satisfied:
                    X_temp1, _ = normalize_total(og_X)
                    X_temp2, _ = normalize_total(X_core)
                    X = np.concatenate((X_temp1, X_temp2), axis=1)
                    X_gene_means = np.zeros(numb_genes_out)
                    X_gene_variance = np.zeros(numb_genes_out)
                    for i in range(numb_genes):
                        X_gene_means[i] = np.mean(X[:, i * x_timepoints : i * x_timepoints + x_timepoints])
                        X_gene_variance[i] = np.var(X[:, i * x_timepoints : i * x_timepoints + x_timepoints])
                        if X_gene_variance[i] >= x_var_low and X_gene_variance[i] <= x_var_up:
                            print("Gene ", gene_list_x[i], " has variance ", X_gene_variance[i])

                    print("X Mean:", X_gene_means.mean())
                    print("X Variance:", X_gene_variance.mean())

                    if X_gene_variance.mean() >= x_var_low and X_gene_variance.mean() <= x_var_up:
                        print("X satisfied!")
                        x_satisfied = True
                    elif orthologs:
                        continue

                if not y_satisfied:
                    Y_temp1, _ = normalize_total(og_Y)
                    Y_temp2, _ = normalize_total(Y_core)
                    Y = np.concatenate((Y_temp1, Y_temp2), axis=1)
                    Y_gene_means = np.zeros(numb_genes_out)
                    Y_gene_variance = np.zeros(numb_genes_out)
                    for i in range(numb_genes_out):
                        Y_gene_means[i] = np.mean(Y[:, i * y_timepoints : i * y_timepoints + y_timepoints])
                        Y_gene_variance[i] = np.var(Y[:, i * y_timepoints : i * y_timepoints + y_timepoints])

                    print("Y Mean:", Y_gene_means.mean())
                    print("Y Variance:", Y_gene_variance.mean())

                    if Y_gene_variance.mean() >= y_var_low and Y_gene_variance.mean() <= y_var_up:
                        print("Y satisfied!")
                        y_satisfied = True
                    elif orthologs:
                        x_satisfied = False  # Dan: starting all over again

                if x_satisfied and y_satisfied:
                    X_final = np.concatenate((og_X, X_core), axis=1)
                    for x in gene_list_x_core:
                        gene_list_x.append(x)
                    with open(file_out1, 'wb') as f:
                        pickle.dump([X_final, data_compounds, gene_list_x, X_gene_variance], f)
                    print("X genes:", gene_list_x)
                    print("X variance:", X_gene_variance.mean())

                    Y_final = np.concatenate((og_Y, Y_core), axis=1)
                    for x in gene_list_y_core:
                        gene_list_y.append(x)
                    with open(file_out2, 'wb') as f:
                        pickle.dump([Y_final, data_compounds, gene_list_y, Y_gene_variance], f)
                    print("Y genes:", gene_list_y)
                    print("Y variance:", Y_gene_variance.mean())


    print("\n\n\nGene selection done.")

    #----------------------------------------------
    global x_vivo, y_vivo
    x_vivo = x_type == "rat_vivo"
    y_vivo = y_type == "rat_vivo"
    # ---------------------------------------------

if __name__ == '__main__':
    main()