Skip to content

Commit 101b9ad

Browse files
committed
adds a legacy pixi environment and fixes some issues with the TSS cpg
bins python script (fixes #1)
1 parent 3f51e84 commit 101b9ad

3 files changed

Lines changed: 992 additions & 57 deletions

File tree

legacy/TSS_cpg_bins.py

Lines changed: 58 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,21 @@ def argparser():
1616
parser.add_argument("--prefix", required = True, help = "Prefix for the output file")
1717
parser.add_argument("--annotation", required = True, help = "Annotation GTF file")
1818
parser.add_argument("--plot", required = False, help = "Whether you want to plot the results or not")
19-
19+
2020
args = parser.parse_args()
2121
return args
2222

2323
def write_fasta(fasta_dict, file_handle, string):
2424
''' Writes a dict of sequences into a fasta file'''
25-
25+
2626
print("Writing {}".format(file_handle))
2727
out = open(file_handle, "w")
2828
for chrom in fasta_dict:
2929
if string == True:
3030
out.write(">{}\n{}\n".format(chrom, fasta_dict[chrom]))
3131
else:
3232
out.write(">{}\n{}\n".format(chrom, "".join(map(str, fasta_dict[chrom]))))
33-
33+
3434

3535
def parse_fasta(fasta):
3636
''' Parses a fasta file and stores it in a dictionary'''
@@ -61,7 +61,7 @@ def parse_fasta(fasta):
6161
return fasta_dict
6262

6363
def convert_to_binary(fasta_dict):
64-
'''
64+
'''
6565
Reads a fasta dictionary and converts it to 'binary', in which
6666
every CG position is a 1, and all other bases are 0s
6767
'''
@@ -78,9 +78,9 @@ def convert_to_binary(fasta_dict):
7878
for pos in cgs:
7979
bin_dict[chrom][pos] = 1
8080
bin_dict[chrom][pos + 1] = 1
81-
81+
8282
return bin_dict, cpg_bin_dict
83-
83+
8484
def parse_binary_fasta(binary_fasta):
8585
''' Parses a pre-made 'binary' fasta, assuming the sequences are on one line each'''
8686

@@ -107,11 +107,11 @@ def parse_binary_fasta(binary_fasta):
107107
else:
108108
bin_dict[chrom] = line.strip()
109109
binary_fasta.close()
110-
110+
111111
return bin_dict, cpg_bin_dict
112112

113113
def make_cpg_dict(bin_dict):
114-
'''
114+
'''
115115
'''
116116

117117
print("Making the CpG dict")
@@ -123,16 +123,16 @@ def make_cpg_dict(bin_dict):
123123
return cpg_bin_dict
124124

125125
def parse_all_bins(all_bins_file):
126-
126+
127127
print("Parsing the bins file")
128128
file = open(all_bins_file, "r")
129129
for line in file:
130130
all_bins = line.strip().split("\t")
131-
131+
132132
all_bins = [int(x) for x in all_bins]
133-
133+
134134
return(all_bins)
135-
135+
136136

137137

138138
def parse_annotation(annotation):
@@ -160,8 +160,8 @@ def parse_annotation(annotation):
160160
end = start + 4000
161161
tss_dict[chrom].append(((start, end), line[6]))
162162
annotation.close()
163-
164-
return tss_dict
163+
164+
return tss_dict
165165

166166
def parse_methylkit(methylkit, cpg_bin_dict):
167167
''' Turns any 0s in cpg_bin_dict to 1s if present in methylkit file'''
@@ -173,22 +173,20 @@ def parse_methylkit(methylkit, cpg_bin_dict):
173173
methylkit_file = open(methylkit, "r")
174174
for line in methylkit_file:
175175
if methylkit.endswith(".gz"):
176-
line = str(line, "utf-8")
176+
line = str(line, "utf-8")
177177
if not line.startswith("chrBase"):
178178
line = line.split("\t")
179179
pos = int(line[2]) - 1
180180
chrom = line[1]
181-
# cpg_bin_dict[chrom][pos] = 1
182-
if chrom == "chr13":
181+
if chrom in cpg_bin_dict:
183182
cpg_bin_dict[chrom][pos] = 1
184-
185-
# for chrom in cpg_bin_dict:
186-
for chrom in ["chr13"]:
183+
184+
for chrom in cpg_bin_dict:
187185
temp = "".join(map(str, cpg_bin_dict[chrom]))
188186
cpg_bin_dict[chrom] = temp
189-
187+
190188
return cpg_bin_dict
191-
189+
192190

193191
def calculate_meth_bin(tss_dict, cpg_bin_dict):
194192
'''
@@ -199,7 +197,6 @@ def calculate_meth_bin(tss_dict, cpg_bin_dict):
199197

200198
methylation_bins = [0 for x in range(400)]
201199
for chrom in tss_dict:
202-
# for chrom in ["chr13"]:
203200
print(chrom)
204201
for pos in tss_dict[chrom]:
205202
meth_slice = cpg_bin_dict[chrom][pos[0][0]:pos[0][1]]
@@ -212,17 +209,16 @@ def calculate_meth_bin(tss_dict, cpg_bin_dict):
212209
else:
213210
methylation_bins[399 - my_bin] += 1
214211
my_bin += 1
215-
212+
216213
return methylation_bins
217-
214+
218215
def calculate_both_bins(tss_dict, bin_dict, cpg_bin_dict):
219216
''' Will calculate both all_bins and methylation_bins if neither is specified'''
220217

221218
print("Calculating the bins...")
222219
all_bins = [0] * 400
223220
methylation_bins = [0] * 400
224-
# for chrom in tss_dict:
225-
for chrom in ["chr13"]:
221+
for chrom in tss_dict:
226222
print(chrom)
227223
for pos in tss_dict[chrom]:
228224
my_slice = bin_dict[chrom][pos[0][0]:pos[0][1]]
@@ -235,7 +231,7 @@ def calculate_both_bins(tss_dict, bin_dict, cpg_bin_dict):
235231
else:
236232
all_bins[399 - my_bin] += 1
237233
my_bin += 1
238-
234+
239235
my_bin = 0
240236
for x in range(0,len(meth_slice), 10):
241237
if "1" in meth_slice[x:x+10]:
@@ -244,25 +240,33 @@ def calculate_both_bins(tss_dict, bin_dict, cpg_bin_dict):
244240
else:
245241
methylation_bins[399 - my_bin] += 1
246242
my_bin += 1
247-
243+
248244
return all_bins, methylation_bins
249245

250246
def normalize_bins(all_bins, methylation_bins):
251247
''' Normalizes the methylation bin to percent of bins covered that have CpGs'''
252-
253-
print("Normalizing the bins")
254-
normal_all_bins = [(all_bins[x] / all_bins[x]) * 100 for x in range(len(all_bins))]
255-
normal_methylation_bins = [(methylation_bins[x] / all_bins[x]) * 100 for x in range(len(all_bins))]
256248

257-
return(normal_all_bins, normal_methylation_bins)
249+
print("Normalizing the bins")
250+
normal_all_bins = [100 if all_bins[x] != 0 else 0 for x in range(len(all_bins))]
251+
normal_methylation_bins = [(methylation_bins[x] / all_bins[x]) * 100 if all_bins[x] != 0 else 0 for x in range(len(all_bins))]
252+
253+
return(normal_all_bins, normal_methylation_bins)
254+
255+
def plot_bins(all_bins, methylation_bins, prefix, suffix=""):
256+
''' Plots the bins and saves to file'''
258257

259-
def plot_bins(all_bins, methylation_bins):
260-
''' Plots the bins simply'''
261-
262258
print("Plotting!")
263-
plt.plot(all_bins)
264-
plt.plot(methylation_bins)
265-
plt.show()
259+
plt.figure(figsize=(10, 6))
260+
plt.plot(all_bins, label='All CpG bins')
261+
plt.plot(methylation_bins, label='Methylation bins')
262+
plt.xlabel('Bin position (10bp bins around TSS +/- 2kb)')
263+
plt.ylabel('Count')
264+
plt.title(f'TSS CpG Distribution{suffix}')
265+
plt.legend()
266+
output_file = f"{prefix}{suffix}_plot.png"
267+
plt.savefig(output_file, dpi=150, bbox_inches='tight')
268+
print(f"Saved plot to {output_file}")
269+
plt.close()
266270

267271
def write_files(all_bins, methylation_bins, normal_methylation_bins, prefix):
268272
''' Writes the bins to tab delimited files'''
@@ -272,25 +276,25 @@ def write_files(all_bins, methylation_bins, normal_methylation_bins, prefix):
272276
out = open("{}_allbins_raw.tab".format(prefix), "w")
273277
out.write("\t".join(map(str, all_bins)))
274278
out.close()
275-
279+
276280
out = open("{}_methylation_raw.tab".format(prefix), "w")
277281
out.write("\t".join(map(str, methylation_bins)))
278282
out.close()
279-
283+
280284
out = open("{}_methylation_normalized.tab".format(prefix), "w")
281285
out.write("\t".join(map(str, normal_methylation_bins)))
282286
out.close()
283-
284-
287+
288+
285289

286290
def run_script():
287291
print("Running script...")
288292
args = argparser()
289-
293+
290294
if args.fasta == None and args.binary_fasta == None:
291295
print("Need either a fasta or the 'binary' fasta file!")
292296
sys.exit()
293-
297+
294298
if args.binary_fasta:
295299
bin_dict = parse_fasta(args.binary_fasta)
296300
if args.binary_methylkit:
@@ -302,27 +306,24 @@ def run_script():
302306
fasta_dict = parse_fasta(args.fasta)
303307
bin_dict, cpg_bin_dict = convert_to_binary(fasta_dict)
304308
write_fasta(bin_dict, "{}_cpgs.fa".format(args.prefix), False)
305-
309+
306310
tss_dict = parse_annotation(args.annotation)
307311
write_fasta(cpg_bin_dict, "{}_covered_cpgs.fa".format(args.prefix), False)
308-
312+
309313
if args.all_bins:
310314
all_bins = parse_all_bins(args.all_bins)
311315
methylation_bins = calculate_meth_bin(tss_dict, cpg_bin_dict)
312316
else:
313317
all_bins, methylation_bins = calculate_both_bins(tss_dict, bin_dict, cpg_bin_dict)
314-
318+
315319
normal_all_bins, normal_methylation_bins = normalize_bins(all_bins, methylation_bins)
316-
320+
317321
write_files(all_bins, methylation_bins, normal_methylation_bins, args.prefix)
318-
322+
319323
if args.plot:
320-
plot_bins(all_bins, methylation_bins)
321-
plot_bins(normal_all_bins, normal_methylation_bins)
322-
324+
plot_bins(all_bins, methylation_bins, args.prefix, "_raw")
325+
plot_bins(normal_all_bins, normal_methylation_bins, args.prefix, "_normalized")
326+
323327
if __name__ == "__main__":
324328
run_script()
325329
print("All done!")
326-
327-
328-

0 commit comments

Comments
 (0)