-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFantom.py
More file actions
293 lines (222 loc) · 11.1 KB
/
Fantom.py
File metadata and controls
293 lines (222 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
import pandas as pd
import time
from multiprocessing import Pool, Manager
from itertools import repeat
import itertools
import numpy as np
import gc
import random
import pickle as pk
import os
from Bio import SeqIO
class Driver():
def __init__(self, fantom_file, pickle_file, triplet_sim_file, triplet_dis_file, fasta_dir, sequence_file, cores, iteration):
"""
fantom_file - path to fantom matrix
pickle_file - path to pickle file; does not have to exist and can be made; if exists, can be laoded
triplet_sim_file - output of the similar triplet tensor
triplet_dis_file - output of the dissimilar triplet tensor
fasta_dir - fasta files for each chromosome
sequence_file - Output each enhancer in a multi-fasta format file
"""
self.fantom_file = fantom_file
self.pickle_file = pickle_file
self.triplet_sim_file = triplet_sim_file
self.triplet_dis_file = triplet_dis_file
self.sequence_file = sequence_file
self.fasta_dir = fasta_dir
self.crm_dict = None
assert os.path.exists(self.fantom_file)
# assert os.path.exists(self.fasta_dir)
self.iteration = iteration
self.cores = cores # Time: 524.9272162914276
# Time: 786.3049099445343
#nrows = 1000
self.df_clean = pd.read_csv(self.fantom_file, sep='\t', index_col = 0).astype(np.uintc) # ,nrows=nrows
# # Drop regions that do not show activity in any of the cell types
# row_sum_list = df.sum(axis=1)
# self.df_clean = df[row_sum_list > 0]
# self.df_clean = self.df_clean.astype(np.uintc)
# label_list = list(self.df_clean.index)
# print(len(self.df_clean))
# keep_list = []
# for label in label_list:
# colon = label.find(":")
# dash = label.find("-")
# start = int(label[colon + 1:dash])
# end = int(label[dash + 1:])
# size = end-start
# if size >= 100 and size <= 1000:
# keep_list.append(label)
# self.df_clean = self.df_clean.loc[keep_list]
# print(len(self.df_clean))
# Get row names
row_names = self.df_clean.index.tolist()
self.crm_index_table = {}
for index, a_crm in enumerate(row_names):
self.crm_index_table[a_crm] = index
self.all_index_array = np.arange(0, len(self.crm_index_table), dtype=np.uintc)
group_by = int(len(row_names) / self.cores) + 1
self.partition_list = [row_names[ x * group_by : min((x + 1) * group_by, len(row_names))] for x in range(self.cores)]
def make_crm_dict(self):
'''
Make a dictionary of CRM objects that will be run using multiprocessing.
Then, saving it into a file for memory efficiency.
'''
time_start = time.time()
with Pool(processes=self.cores) as p:
element_list = p.map(self.make_crm_dict_parallel, self.partition_list)
time_end = time.time()
print('Time: ', time_end-time_start)
self.crm_dict = None
self.crm_dict = {}
for dictionary in element_list:
self.crm_dict.update(dictionary)
with open(self.pickle_file, 'wb') as f:
pk.dump(self.crm_dict, f)
self.crm_list = list(self.crm_dict.values())
del self.crm_dict
gc.collect()
def make_crm_dict_parallel(self, name_list):
'''
Make a dictionary of CRM objects. Each object knows its similar and dissimilar regions.
Function for multiprocessing.
'''
# A table of the following format: index -> a np array of ushort
r = {}
counter = 0
l = len(name_list)
for name in name_list:
sim_crm_mask = np.any(self.df_clean.values[:, None] & self.df_clean.loc[name].values, axis=2).any(axis=1)
sim_list = self.df_clean.index[sim_crm_mask].tolist()
r[self.crm_index_table[name]] = np.array([self.crm_index_table[a_crm] for a_crm in sim_list], dtype=np.uintc)
if counter % 100 == 0:
print(f'{counter} out of {l}')
counter += 1
print(f'{l} out of {l}')
return r
def read_crm_dict(self):
'''
Loads the dictionary of CRM objects.
'''
assert os.path.exists(self.pickle_file)
with open(self.pickle_file, 'rb') as f:
self.crm_dict = pk.load(f)
self.crm_list = list(self.crm_dict.values())
del self.crm_dict
gc.collect()
# def get_similar(self, is_less):
# def get_similar_replace():
# pass
# def get_similar_without():
# pass
# return get_similar_replace if is_less else get_similar_without
def make_negative_list(self, positive_array, anchor_sim_array):
'''
Creating a negative list from all of the arrays (anchor, positive, and negative) minus the union of both the positive and the anchor arrays.
'''
r = np.zeros(self.iteration)
for x, positive_i in enumerate(positive_array):
# Similar instances to the positive
positive_sim_array = self.crm_list[positive_i]
# Union of the positive and anchor similar sets
union_array = np.union1d(anchor_sim_array, positive_sim_array)
# Set of instances that neither positive or anchor are similar to, i.e., negative
negative_array = np.setdiff1d(self.all_index_array, union_array)
assert len(negative_array) > 0, "The negative array has no negatives!"
r[x] = np.random.choice(negative_array)
return r
# def make_similar_list(self, positive_array):
# similar_array = positive_array.copy()
# np.random.shuffle(similar_array)
# for i in range(len(similar_array)):
# while similar_array[i] == positive_array[i]:
# similar_array[i] = np.random.choice(positive_array, size=1, replace = True)
# return similar_array
def make_similar_list(self, anchor, anchor_sim_array, positive_array):
# similar_array = positive_array.copy()
# np.random.shuffle(similar_array)
# for i in range(len(similar_array)):
# while similar_array[i] == positive_array[i]:
# similar_array[i] = np.random.choice(positive_array, size=1, replace = True)
# Loop through the positive array
similar_array = np.zeros(positive_array.shape)
ar = self.df_clean.iloc[anchor].values
for i,p in enumerate(positive_array):
positive_sim_array = self.crm_list[p]
pr = self.df_clean.iloc[p].values
intersect_array = np.intersect1d(anchor_sim_array, positive_sim_array, assume_unique=True)
# randomize the intersect array
np.random.shuffle(intersect_array)
found = False
for val in intersect_array:
sr = self.df_clean.iloc[val].values
# check for overlapping columns of ar, pr, and sr
if np.any(ar & pr & sr):
similar_array[i] = val
found = True
break
assert found, "No similar region found!"
return similar_array
def fill_anchor_block(self, i_list, is_similar):
'''
Creating the anchor, positive, and negative tensors and filling the anchor block with these tensors.
Function that will be utilized for multiprocessing.
'''
block = np.zeros((len(i_list), 3, self.iteration), dtype=np.uintc)
for index, anchor in enumerate(i_list):
if index % 100 == 0:
print(f'Processed {index} out of {len(i_list)}')
# Fill the anchor's channels
block[index, 0, :] = anchor
# Fill the positive's channels
anchor_sim_array = self.crm_list[anchor]
is_replace = True if len(anchor_sim_array) < self.iteration else False
positive_array = np.random.choice(anchor_sim_array, size=self.iteration, replace = is_replace).astype(np.uintc)
block[index, 1, :] = positive_array
# Fill the negative list or similar list
if is_similar:
# block[index, 2, :] = self.make_similar_list(positive_array)
block[index, 2, :] = self.make_similar_list(anchor, anchor_sim_array, positive_array)
else:
block[index, 2, :] = self.make_negative_list(block[index, 1, :], anchor_sim_array)
return block
# Time: 2388.9494280815125
def make_triplet_tensor(self, is_similar):
'''
Creating triplet tensors with the use of multiprocessing which will then be used to concatenate them all together to form
a triplet matrix.
'''
assert self.crm_list != None, "crm_list doesn't exist! Call make_crm_dict() or read_crm_dict()!"
start = time.time()
with Pool(processes=self.cores) as p:
group_by = int(len(self.crm_list) / self.cores) + 1
i_list = [list( range( x * group_by, min((x + 1) * group_by, len(self.crm_list)) ) ) for x in range(self.cores)]
anchor_block_list = p.starmap(self.fill_anchor_block, zip(i_list, repeat(is_similar)))
triplet_matrix = np.concatenate(anchor_block_list)
print(time.time() - start)
if is_similar:
np.save(self.triplet_sim_file, triplet_matrix)
else:
np.save(self.triplet_dis_file, triplet_matrix)
return triplet_matrix
def write_sequences(self, fasta_header = "HG38_", chrom_list = ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"]):
fasta_header = f"{self.fasta_dir}/{fasta_header}"
fasta_dict = {f"{chrom}": str(SeqIO.read(f'{fasta_header}{chrom}.fa', 'fasta').seq) for chrom in chrom_list}
with open(self.sequence_file, 'w') as file:
for label in list(self.df_clean.index):
colon = label.find(":")
dash = label.find("-")
chrom = label[:colon]
start = int(label[colon + 1:dash])
end = int(label[dash + 1:])
seq = fasta_dict[chrom][start:end]
file.write(f">{label}\n")
file.write(seq.upper() + "\n")
def run(self, is_write = True, fasta_header = "HG38_", chrom_list = ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"]):
# self.make_crm_dict()
self.read_crm_dict()
self.make_triplet_tensor(True)
self.make_triplet_tensor(False)
if is_write:
self.write_sequences(fasta_header, chrom_list)