@@ -16,21 +16,21 @@ def argparser():
1616 parser .add_argument ("--prefix" , required = True , help = "Prefix for the output file" )
1717 parser .add_argument ("--annotation" , required = True , help = "Annotation GTF file" )
1818 parser .add_argument ("--plot" , required = False , help = "Whether you want to plot the results or not" )
19-
19+
2020 args = parser .parse_args ()
2121 return args
2222
2323def write_fasta (fasta_dict , file_handle , string ):
2424 ''' Writes a dict of sequences into a fasta file'''
25-
25+
2626 print ("Writing {}" .format (file_handle ))
2727 out = open (file_handle , "w" )
2828 for chrom in fasta_dict :
2929 if string == True :
3030 out .write (">{}\n {}\n " .format (chrom , fasta_dict [chrom ]))
3131 else :
3232 out .write (">{}\n {}\n " .format (chrom , "" .join (map (str , fasta_dict [chrom ]))))
33-
33+
3434
3535def parse_fasta (fasta ):
3636 ''' Parses a fasta file and stores it in a dictionary'''
@@ -61,7 +61,7 @@ def parse_fasta(fasta):
6161 return fasta_dict
6262
6363def convert_to_binary (fasta_dict ):
64- '''
64+ '''
6565 Reads a fasta dictionary and converts it to 'binary', in which
6666 every CG position is a 1, and all other bases are 0s
6767 '''
@@ -78,9 +78,9 @@ def convert_to_binary(fasta_dict):
7878 for pos in cgs :
7979 bin_dict [chrom ][pos ] = 1
8080 bin_dict [chrom ][pos + 1 ] = 1
81-
81+
8282 return bin_dict , cpg_bin_dict
83-
83+
8484def parse_binary_fasta (binary_fasta ):
8585 ''' Parses a pre-made 'binary' fasta, assuming the sequences are on one line each'''
8686
@@ -107,11 +107,11 @@ def parse_binary_fasta(binary_fasta):
107107 else :
108108 bin_dict [chrom ] = line .strip ()
109109 binary_fasta .close ()
110-
110+
111111 return bin_dict , cpg_bin_dict
112112
113113def make_cpg_dict (bin_dict ):
114- '''
114+ '''
115115 '''
116116
117117 print ("Making the CpG dict" )
@@ -123,16 +123,16 @@ def make_cpg_dict(bin_dict):
123123 return cpg_bin_dict
124124
125125def parse_all_bins (all_bins_file ):
126-
126+
127127 print ("Parsing the bins file" )
128128 file = open (all_bins_file , "r" )
129129 for line in file :
130130 all_bins = line .strip ().split ("\t " )
131-
131+
132132 all_bins = [int (x ) for x in all_bins ]
133-
133+
134134 return (all_bins )
135-
135+
136136
137137
138138def parse_annotation (annotation ):
@@ -160,8 +160,8 @@ def parse_annotation(annotation):
160160 end = start + 4000
161161 tss_dict [chrom ].append (((start , end ), line [6 ]))
162162 annotation .close ()
163-
164- return tss_dict
163+
164+ return tss_dict
165165
166166def parse_methylkit (methylkit , cpg_bin_dict ):
167167 ''' Turns any 0s in cpg_bin_dict to 1s if present in methylkit file'''
@@ -173,22 +173,20 @@ def parse_methylkit(methylkit, cpg_bin_dict):
173173 methylkit_file = open (methylkit , "r" )
174174 for line in methylkit_file :
175175 if methylkit .endswith (".gz" ):
176- line = str (line , "utf-8" )
176+ line = str (line , "utf-8" )
177177 if not line .startswith ("chrBase" ):
178178 line = line .split ("\t " )
179179 pos = int (line [2 ]) - 1
180180 chrom = line [1 ]
181- # cpg_bin_dict[chrom][pos] = 1
182- if chrom == "chr13" :
181+ if chrom in cpg_bin_dict :
183182 cpg_bin_dict [chrom ][pos ] = 1
184-
185- # for chrom in cpg_bin_dict:
186- for chrom in ["chr13" ]:
183+
184+ for chrom in cpg_bin_dict :
187185 temp = "" .join (map (str , cpg_bin_dict [chrom ]))
188186 cpg_bin_dict [chrom ] = temp
189-
187+
190188 return cpg_bin_dict
191-
189+
192190
193191def calculate_meth_bin (tss_dict , cpg_bin_dict ):
194192 '''
@@ -199,7 +197,6 @@ def calculate_meth_bin(tss_dict, cpg_bin_dict):
199197
200198 methylation_bins = [0 for x in range (400 )]
201199 for chrom in tss_dict :
202- # for chrom in ["chr13"]:
203200 print (chrom )
204201 for pos in tss_dict [chrom ]:
205202 meth_slice = cpg_bin_dict [chrom ][pos [0 ][0 ]:pos [0 ][1 ]]
@@ -212,17 +209,16 @@ def calculate_meth_bin(tss_dict, cpg_bin_dict):
212209 else :
213210 methylation_bins [399 - my_bin ] += 1
214211 my_bin += 1
215-
212+
216213 return methylation_bins
217-
214+
218215def calculate_both_bins (tss_dict , bin_dict , cpg_bin_dict ):
219216 ''' Will calculate both all_bins and methylation_bins if neither is specified'''
220217
221218 print ("Calculating the bins..." )
222219 all_bins = [0 ] * 400
223220 methylation_bins = [0 ] * 400
224- # for chrom in tss_dict:
225- for chrom in ["chr13" ]:
221+ for chrom in tss_dict :
226222 print (chrom )
227223 for pos in tss_dict [chrom ]:
228224 my_slice = bin_dict [chrom ][pos [0 ][0 ]:pos [0 ][1 ]]
@@ -235,7 +231,7 @@ def calculate_both_bins(tss_dict, bin_dict, cpg_bin_dict):
235231 else :
236232 all_bins [399 - my_bin ] += 1
237233 my_bin += 1
238-
234+
239235 my_bin = 0
240236 for x in range (0 ,len (meth_slice ), 10 ):
241237 if "1" in meth_slice [x :x + 10 ]:
@@ -244,25 +240,33 @@ def calculate_both_bins(tss_dict, bin_dict, cpg_bin_dict):
244240 else :
245241 methylation_bins [399 - my_bin ] += 1
246242 my_bin += 1
247-
243+
248244 return all_bins , methylation_bins
249245
250246def normalize_bins (all_bins , methylation_bins ):
251247 ''' Normalizes the methylation bin to percent of bins covered that have CpGs'''
252-
253- print ("Normalizing the bins" )
254- normal_all_bins = [(all_bins [x ] / all_bins [x ]) * 100 for x in range (len (all_bins ))]
255- normal_methylation_bins = [(methylation_bins [x ] / all_bins [x ]) * 100 for x in range (len (all_bins ))]
256248
257- return (normal_all_bins , normal_methylation_bins )
249+ print ("Normalizing the bins" )
250+ normal_all_bins = [100 if all_bins [x ] != 0 else 0 for x in range (len (all_bins ))]
251+ normal_methylation_bins = [(methylation_bins [x ] / all_bins [x ]) * 100 if all_bins [x ] != 0 else 0 for x in range (len (all_bins ))]
252+
253+ return (normal_all_bins , normal_methylation_bins )
254+
255+ def plot_bins (all_bins , methylation_bins , prefix , suffix = "" ):
256+ ''' Plots the bins and saves to file'''
258257
259- def plot_bins (all_bins , methylation_bins ):
260- ''' Plots the bins simply'''
261-
262258 print ("Plotting!" )
263- plt .plot (all_bins )
264- plt .plot (methylation_bins )
265- plt .show ()
259+ plt .figure (figsize = (10 , 6 ))
260+ plt .plot (all_bins , label = 'All CpG bins' )
261+ plt .plot (methylation_bins , label = 'Methylation bins' )
262+ plt .xlabel ('Bin position (10bp bins around TSS +/- 2kb)' )
263+ plt .ylabel ('Count' )
264+ plt .title (f'TSS CpG Distribution{ suffix } ' )
265+ plt .legend ()
266+ output_file = f"{ prefix } { suffix } _plot.png"
267+ plt .savefig (output_file , dpi = 150 , bbox_inches = 'tight' )
268+ print (f"Saved plot to { output_file } " )
269+ plt .close ()
266270
267271def write_files (all_bins , methylation_bins , normal_methylation_bins , prefix ):
268272 ''' Writes the bins to tab delimited files'''
@@ -272,25 +276,25 @@ def write_files(all_bins, methylation_bins, normal_methylation_bins, prefix):
272276 out = open ("{}_allbins_raw.tab" .format (prefix ), "w" )
273277 out .write ("\t " .join (map (str , all_bins )))
274278 out .close ()
275-
279+
276280 out = open ("{}_methylation_raw.tab" .format (prefix ), "w" )
277281 out .write ("\t " .join (map (str , methylation_bins )))
278282 out .close ()
279-
283+
280284 out = open ("{}_methylation_normalized.tab" .format (prefix ), "w" )
281285 out .write ("\t " .join (map (str , normal_methylation_bins )))
282286 out .close ()
283-
284-
287+
288+
285289
286290def run_script ():
287291 print ("Running script..." )
288292 args = argparser ()
289-
293+
290294 if args .fasta == None and args .binary_fasta == None :
291295 print ("Need either a fasta or the 'binary' fasta file!" )
292296 sys .exit ()
293-
297+
294298 if args .binary_fasta :
295299 bin_dict = parse_fasta (args .binary_fasta )
296300 if args .binary_methylkit :
@@ -302,27 +306,24 @@ def run_script():
302306 fasta_dict = parse_fasta (args .fasta )
303307 bin_dict , cpg_bin_dict = convert_to_binary (fasta_dict )
304308 write_fasta (bin_dict , "{}_cpgs.fa" .format (args .prefix ), False )
305-
309+
306310 tss_dict = parse_annotation (args .annotation )
307311 write_fasta (cpg_bin_dict , "{}_covered_cpgs.fa" .format (args .prefix ), False )
308-
312+
309313 if args .all_bins :
310314 all_bins = parse_all_bins (args .all_bins )
311315 methylation_bins = calculate_meth_bin (tss_dict , cpg_bin_dict )
312316 else :
313317 all_bins , methylation_bins = calculate_both_bins (tss_dict , bin_dict , cpg_bin_dict )
314-
318+
315319 normal_all_bins , normal_methylation_bins = normalize_bins (all_bins , methylation_bins )
316-
320+
317321 write_files (all_bins , methylation_bins , normal_methylation_bins , args .prefix )
318-
322+
319323 if args .plot :
320- plot_bins (all_bins , methylation_bins )
321- plot_bins (normal_all_bins , normal_methylation_bins )
322-
324+ plot_bins (all_bins , methylation_bins , args . prefix , "_raw" )
325+ plot_bins (normal_all_bins , normal_methylation_bins , args . prefix , "_normalized" )
326+
323327if __name__ == "__main__" :
324328 run_script ()
325329 print ("All done!" )
326-
327-
328-
0 commit comments