diff --git a/scripts/leafcutter_cluster_regtools_py3.py b/scripts/leafcutter_cluster_regtools_py3.py new file mode 100644 index 0000000..86511c6 --- /dev/null +++ b/scripts/leafcutter_cluster_regtools_py3.py @@ -0,0 +1,522 @@ + +# written by Yang Li for the Leafcutter repo +# forked by mdshw5 and converted to Python3 +# https://github.com/mdshw5/leafcutter/blob/master/scripts/leafcutter_cluster_regtools.py + +# requires regtools installation + +# https://github.com/griffithlab/regtools +# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc +# Using regtools speeds up the junction extraction step by an order of magnitude or more + +import sys +import tempfile +import os +import gzip +import shutil + +def main(options,libl): + + if options.cluster == None: + pool_junc_reads(libl, options) + refine_clusters(options) + + sort_junctions(libl, options) + merge_junctions(options) + get_numers(options) + +def pool_junc_reads(flist, options): + + outPrefix = options.outprefix + rundir = options.rundir + maxIntronLen = int(options.maxintronlen) + checkchrom = options.checkchrom + + outFile = "%s/%s_pooled"%(rundir,outPrefix) + + chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] + by_chrom = {} + for libl in flist: + + lib = libl.strip() + if not os.path.isfile(lib): + continue + + if options.verbose: + sys.stderr.write("scanning %s...\n"%lib) + + for ln in open(lib): + + lnsplit=ln.split() + if len(lnsplit)<6: + sys.stderr.write("Error in %s \n" % lib) + continue + chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit + if int(blockCount) > 2: + print(ln) + continue + + if checkchrom and (chrom not in chromLst): continue + Aoff, Boff = blockSize.split(",") + A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 + if B-A > int(maxIntronLen): continue + try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)] + except: + try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + except: by_chrom[(chrom,strand)] = {(A,B):int(counts)} + + fout = open(outFile, 'w') + Ncluster = 0 + sys.stderr.write("Parsing...\n") + #print("HOW MANY CHROMOSOMES ARE THERE ANYWAY?") + #print(by_chrom) + #print("there are %d elements in by_chrom"%len(by_chrom)) + for chrom in by_chrom: + read_ks = [k for k,v in list(by_chrom[chrom].items()) if v >= 3] # a junction must have at least 3 reads + read_ks.sort() + sys.stderr.write("%s:%s.."%chrom) + if len(read_ks) == 0: + continue # weird test case for toy data with only 1 gene - two chroms but one is empty after filtering + #print("LOOK HERE BOYO") + #print(read_ks) + clu = cluster_intervals(read_ks)[0] + for cl in clu: + if len(cl) > 1: # if cluster has more than one intron + buf = '%s:%s '%chrom + for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: + buf += "%d:%d" % interval + ":%d"%count+ " " + fout.write(buf+'\n') + Ncluster += 1 + sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) + fout.close() + + +def sort_junctions(libl, options): + + chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] + outPrefix = options.outprefix + rundir = options.rundir + checkchrom = options.checkchrom + + if options.cluster == None: + refined_cluster = "%s/%s_refined"%(rundir,outPrefix) + else: + refined_cluster = options.cluster + runName = "%s/%s"%(rundir, outPrefix) + + + exons, cluExons = {}, {} + cluN = 0 + + for ln in open(refined_cluster): + chrom = ln.split()[0] + cluN += 1 + for exon in ln.split()[1:]: + A, B, count = exon.split(":") + if chrom not in exons: + exons[chrom] = {} + exons[chrom][(int(A),int(B))] = cluN + if cluN not in cluExons: + cluExons[cluN] = [] + cluExons[cluN].append((chrom, A, B)) + + merges = {} + for ll in libl: + lib=ll.rstrip() + if not os.path.isfile(lib): + continue + libN = lib + if libN not in merges: + merges[libN] = [] + merges[libN].append(lib) + + fout_runlibs = open(runName+"_sortedlibs",'w') + + for libN in merges: + libName = "%s/%s"%(rundir,libN.split('/')[-1]) + by_chrom = {} + foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) + + fout_runlibs.write(foutName+'\n') + + if options.verbose: + sys.stderr.write("Sorting %s..\n"%libN) + if len(merges[libN]) > 1: + if options.verbose: + sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) + else: + pass + + header_string = "chrom %s\n"%libN.split("/")[-1].split(".junc")[0] + + fout = gzip.open(foutName, 'wb') + + fout.write(header_string.encode('utf-8') ) + #fout = gzip.open(foutName,'wb') + + #fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) + + for lib in merges[libN]: + + for ln in open(lib): + + lnsplit=ln.split() + if len(lnsplit)<6: + sys.stderr.write("Error in %s \n" % lib) + continue + chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit + if int(blockCount) > 2: + print(ln) + continue + + if checkchrom and (chrom not in chromLst): continue + Aoff, Boff = blockSize.split(",") + A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 + chrom = (chrom,strand) + if chrom not in by_chrom: + by_chrom[chrom] = {} + intron = (A, B) + + if intron in by_chrom[chrom]: + by_chrom[chrom][intron] += int(count) + else: + by_chrom[chrom][intron] = int(count) + + for clu in cluExons: + buf = [] + ks = cluExons[clu] + ks.sort() + tot = 0 + for exon in ks: + chrom, start, end = exon + chrom = tuple(chrom.split(":")) + start, end = int(start), int(end) + if chrom not in by_chrom: + pass + elif (start,end) in by_chrom[chrom]: + tot += by_chrom[chrom][(start,end)] + for exon in ks: + + chrom, start, end = exon + start, end = int(start), int(end) + chrom = tuple(chrom.split(":")) + chromID, strand = chrom + if chrom not in by_chrom: + buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) + elif (start,end) in by_chrom[chrom]: + buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) + else: + buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) + + fout.write("".join(buf).encode('utf-8') ) + fout.close() + fout_runlibs.close() + +def refine_clusters(options): + + outPrefix = options.outprefix + rundir = options.rundir + minratio = float(options.mincluratio) + minreads = int(options.minclureads) + + inFile = "%s/%s_pooled"%(rundir,outPrefix) + outFile = "%s/%s_refined"%(rundir,outPrefix) + + fout = open(outFile,'w') + Ncl = 0 + for ln in open(inFile): + clu = [] + totN = 0 + chrom = ln.split()[0] + for ex in ln.split()[1:]: + A, B, N = ex.split(":") + clu.append(((int(A),int(B)), int(N))) + totN += int(N) + if totN < minreads: continue + #print "CLU",clu + #print "linked",refine_linked(clu) + #print '\n\n' + + for cl in refine_linked(clu): + rc = refine_cluster(cl,minratio, minreads) + if len(rc) > 0: + for clu in rc: + buf = '%s ' % chrom + for interval, count in clu: + buf += "%d:%d" % interval + ":%d"%(count)+ " " + Ncl += 1 + fout.write(buf+'\n') + sys.stderr.write("Split into %s clusters...\n"%Ncl) + fout.close() + + +def merge_junctions(options): + ''' function to merge junctions ''' + + outPrefix = options.outprefix + rundir = options.rundir + fnameout = "%s/%s"%(rundir,outPrefix) + flist = "%s/%s_sortedlibs"%(rundir, outPrefix) + + lsts = [] + for ln in open(flist): + lsts.append(ln.strip()) + if options.verbose: + sys.stderr.write("merging %d junction files...\n"%(len(lsts))) + + # Change 300 if max open file is < 300 + N = min([300, max([100, int(len(lsts)**(0.5))])]) + + tmpfiles = [] + while len(lsts) > 1: + clst = [] + + for i in range(0,(len(lsts)//N)+1): + lst = lsts[N*i:N*(i+1)] + if len(lst) > 0: + clst.append(lst) + lsts = [] + + for lst in clst: + if len(lst) == 0: continue + tmpfile = tempfile.mktemp() + os.mkdir(tmpfile) + foutname = tmpfile+"/tmpmerge.gz" + fout = gzip.open(foutname,'w') + + merge_files(lst, fout, options) + lsts.append(foutname) + tmpfiles.append(foutname) + fout.close() + + shutil.move(lsts[0], fnameout+"_perind.counts.gz") + +def merge_files(fnames, fout, options): + + fopen = [] + for fname in fnames: + if fname[-3:] == ".gz": + fopen.append(gzip.open(fname, "rt")) # rt mode opens gzipped file as text file + else: + fopen.append(open(fname)) + + finished = False + N = 0 + while not finished: + N += 1 + if N % 50000 == 0: + sys.stderr.write(".") + buf = [] + for f in fopen: + ln = f.readline().split() + if len(ln) == 0: + finished = True + break + chrom = ln[0] + data = ln[1:] + if len(buf) == 0: + buf.append(chrom) + buf += data + if len(buf) > 0: + if buf[0] == "chrom": + if options.verbose: + sys.stderr.write("merging %d files"%(len(buf)-1)) + + # problematic line - buf is stored as bytes rather than a string + out_string = " ".join(buf)+'\n' + fout.write(out_string.encode('utf-8') ) + else: + break + + sys.stderr.write(" done.\n") + for fin in fopen: + fin.close() + + +def cluster_intervals(E): + ''' Clusters intervals together. ''' + E.sort() + #print(len(E)) + current = E[0] + Eclusters, cluster = [], [] + + i = 0 + while i < len(E): + + if overlaps(E[i], current): + cluster.append(E[i]) + else: + Eclusters.append(cluster) + cluster = [E[i]] + current = (E[i][0], max([current[1], E[i][1]])) + i += 1 + + if len(cluster) > 0: + + Eclusters.append(cluster) + + + return Eclusters, E + +def overlaps(A,B): + ''' + Checks if A and B overlaps + ''' + + if A[1] < B[0] or B[1] < A[0]: + return False + else: return True + +def refine_linked(clusters): + + unassigned = [x for x in clusters[1:]] + current = [clusters[0]] + splicesites = set([current[0][0][0],current[0][0][1]]) + newClusters = [] + while len(unassigned) > 0: + finished = False + + while not finished: + finished = True + torm = [] + for intron in unassigned: + inter, count = intron + start, end = inter + if start in splicesites or end in splicesites: + current.append(intron) + splicesites.add(start) + splicesites.add(end) + finished = False + torm.append(intron) + for intron in torm: + unassigned.remove(intron) + newClusters.append(current) + current = [] + if len(unassigned) > 0: + current = [unassigned[0]] + splicesites = set([current[0][0][0],current[0][0][1]]) + unassigned = unassigned[1:] + return newClusters + + +def refine_cluster(clu, cutoff, readcutoff): + ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, + remove and recluster ''' + + remove = [] + dic = {} + intervals = [] + + reCLU = False + totN = 0 + + for inter, count in clu: + totN += count + for inter, count in clu: + if (count/float(totN) >= cutoff and count >= readcutoff): + intervals.append(inter) + dic[inter] = count + else: + reCLU = True + if len(intervals) == 0: return [] + + # This makes sure that after trimming, the clusters are still good + Atmp, B = cluster_intervals(intervals) + A = [] + for cl in Atmp: + for c in refine_linked([(x,0) for x in cl]): + if len(c) > 0: + A.append([x[0] for x in c]) + + if len(A) == 1: + rc = [(x, dic[x]) for x in A[0]] + if len(rc) > 1: + if reCLU: + return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) + else: + return [[(x, dic[x]) for x in A[0]]] + else: + return [] + NCs = [] + for c in A: + if len(c) > 1: + NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) + NCs += NC + return NCs + + +def get_numers(options): + outPrefix = options.outprefix + rundir = options.rundir + fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) + fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) + input_file=gzip.open(fname, 'rt') # read in as text + fout = gzip.open(fnameout,'w') + first_line=True + + for l in input_file: + if first_line: + header_string = " ".join(l.strip().split(" ")[1:])+'\n' + fout.write(header_string.encode('utf-8')) # print the sample names + first_line=False + else: + l=l.strip() + words=l.split(" ") + words_string = words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n' + fout.write(words_string.encode('utf-8')) + + input_file.close() + fout.close() + +if __name__ == "__main__": + + from optparse import OptionParser + + parser = OptionParser() + + parser.add_option("-j", "--juncfiles", dest="juncfiles", + help="text file with all junction files to be processed") + + parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', + help="output prefix (default leafcutter)") + + parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + + parser.add_option("-r", "--rundir", dest="rundir", default='./', + help="write to directory (default ./)") + + parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, + help="maximum intron length in bp (default 100,000bp)") + + parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, + help="minimum reads in a cluster (default 30 reads)") + + parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, + help="minimum fraction of reads in a cluster that support a junction (default 0.001)") + + parser.add_option("-c", "--cluster", dest="cluster", default = None, + help="refined cluster file when clusters are already made") + + parser.add_option("-k", "--checkchrom", dest="checkchrom", default = True, + help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...") + + (options, args) = parser.parse_args() + + if options.juncfiles == None: + sys.stderr.write("Error: no junction file provided...\n") + exit(0) + + # Get the junction file list + libl = [] + for junc in open(options.juncfiles): + junc = junc.strip() + try: + open(junc) + except: + sys.stderr.write("%s does not exist... check your junction files.\n"%junc) + exit(0) + libl.append(junc) + + main(options, libl) diff --git a/scripts/prepare_phenotype_table_py3.py b/scripts/prepare_phenotype_table_py3.py new file mode 100644 index 0000000..c1a8d71 --- /dev/null +++ b/scripts/prepare_phenotype_table_py3.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python +# written by Yang Li +# ported to python 3 by Jack Humphrey +import sys +import gzip +import numpy as np +import scipy as sc +import pickle + +from optparse import OptionParser + +from sklearn.decomposition import PCA +from sklearn import preprocessing +from sklearn import linear_model + +from scipy.stats import rankdata +from scipy.stats import norm + +def qqnorm(x): + n=len(x) + a=3.0/8.0 if n<=10 else 0.5 + return(norm.ppf( (rankdata(x)-a)/(n+1.0-2.0*a) )) + +def stream_table(f, ss = ''): + fc = '#' + while fc[0] == "#": + fc = f.readline().strip() + head = fc.split(ss) + + for ln in f: + ln = ln.strip().split(ss) + attr = {} + + for i in range(len(head)): + try: attr[head[i]] = ln[i] + except: break + yield attr + +def main(ratio_file, pcs=50): + + dic_pop, fout = {}, {} + try: open(ratio_file) + except: + sys.stderr.write("Can't find %s..exiting\n"%(ratio_file)) + return + + sys.stderr.write("Starting...\n") + for i in range(1,23): + fout[i] = open(ratio_file+".phen_chr%d"%i,'w') + fout_ave = open(ratio_file+".ave",'w') + valRows, valRowsnn, geneRows = [], [], [] + finished = False + # have to now specify text mode + header = gzip.open(ratio_file, mode = 'rt').readline().split()[1:] + + # problematic line - something about py2->py3 and handling gzipped files + for i in fout: + header_string = "\t".join(["#Chr","start", "end", "ID"]+header)+'\n' + fout[i].write(header_string) + #fout[i].write("\t".join(["#Chr","start", "end", "ID"]+header)+'\n') + + for dic in stream_table(gzip.open(ratio_file, mode = 'rt'),' '): + + chrom = dic['chrom'].replace("chr",'') + chr_ = chrom.split(":")[0] + if chr_ in 'XY': continue + NA_indices, valRow, aveReads = [], [], [] + tmpvalRow = [] + + i = 0 + for sample in header: + + try: count = dic[sample] + except: print(chrom, len(dic)) + num, denom = count.split('/') + if float(denom) < 1: + count = "NA" + tmpvalRow.append("NA") + NA_indices.append(i) + else: + # add a 0.5 pseudocount + count = (float(num)+0.5)/((float(denom))+0.5) + tmpvalRow.append(count) + aveReads.append(count) + + + # If ratio is missing for over 40% of the samples, skip + if tmpvalRow.count("NA") > len(tmpvalRow)*0.4: + continue + + ave = np.mean(aveReads) + + # Set missing values as the mean of all values + for c in tmpvalRow: + if c == "NA": valRow.append(ave) + else: valRow.append(c) + + # If there is too little variation, skip (there is a bug in fastqtl which doesn't handle cases with no variation) + if np.std(valRow) < 0.005: continue + + chr_, s, e, clu = chrom.split(":") + if len(valRow) > 0: + chrom_int = int(chr_) + fout[chrom_int].write("\t".join([chr_,s,e,chrom]+[str(x) for x in valRow])+'\n') + fout_ave.write(" ".join(["%s"%chrom]+[str(min(aveReads)), str(max(aveReads)), str(np.mean(aveReads))])+'\n') + + # scale normalize + valRowsnn.append(valRow) + valRow = preprocessing.scale(valRow) + + valRows.append(valRow) + geneRows.append("\t".join([chr_,s,e,chrom])) + if len(geneRows) % 1000 == 0: + sys.stderr.write("Parsed %s introns...\n"%len(geneRows)) + + for i in fout: + fout[i].close() + + # qqnorms on the columns + matrix = np.array(valRows) + for i in range(len(matrix[0,:])): + matrix[:,i] = qqnorm(matrix[:,i]) + + # write the corrected tables + fout = {} + for i in range(1,23): + fn="%s.qqnorm_chr%d"%(ratio_file,i) + print(("Outputting: " + fn)) + fout[i] = open(fn,'w') + fout[i].write("\t".join(['#Chr','start','end','ID'] + header)+'\n') + lst = [] + for i in range(len(matrix)): + chrom, s = geneRows[i].split()[:2] + + lst.append((int(chrom.replace("chr","")), int(s), "\t".join([geneRows[i]] + [str(x) for x in matrix[i]])+'\n')) + + lst.sort() + for ln in lst: + fout[ln[0]].write(ln[2]) + + fout_run = open("%s_prepare.sh"%ratio_file,'w') + + for i in fout: + fout[i].close() + fout_run.write("bgzip -f %s.qqnorm_chr%d\n"%(ratio_file, i)) + fout_run.write("tabix -p bed %s.qqnorm_chr%d.gz\n"%(ratio_file, i)) + fout_run.close() + + sys.stdout.write("Use `sh %s_prepare.sh' to create index for fastQTL (requires tabix and bgzip).\n"%ratio_file) + + if pcs>0: + #matrix = np.transpose(matrix) # important bug fix (removed as of Jan 1 2018) + pcs = min([len(header), pcs]) + pca = PCA(n_components=pcs) + pca.fit(matrix) + pca_fn=ratio_file+".PCs" + print(("Outputting PCs: " + pca_fn)) + pcafile = open(pca_fn,'w') + pcafile.write("\t".join(['id']+header)+'\n') + pcacomp = list(pca.components_) + + for i in range(len(pcacomp)): + pcafile.write("\t".join([str(i+1)]+[str(x) for x in pcacomp[i]])+'\n') + + pcafile.close() + +if __name__ == "__main__": + + parser = OptionParser(usage="usage: %prog [-p num_PCs] input_perind.counts.gz") + parser.add_option("-p", "--pcs", dest="npcs", default = 50, help="number of PCs output") + (options, args) = parser.parse_args() + if len(args)==0: + sys.stderr.write("Error: no ratio file provided... (e.g. python leafcutter/scripts/prepare_phenotype_table.py input_perind.counts.gz\n") + exit(0) + main(args[0], int(options.npcs) ) +