diff --git a/scripts/leafcutter_cluster_regtools_py3.py b/scripts/leafcutter_cluster_regtools_py3.py
new file mode 100644
index 0000000..86511c6
--- /dev/null
+++ b/scripts/leafcutter_cluster_regtools_py3.py
@@ -0,0 +1,522 @@
+
+# written by Yang Li for the Leafcutter repo 
+# forked by mdshw5 and converted to Python3 
+# https://github.com/mdshw5/leafcutter/blob/master/scripts/leafcutter_cluster_regtools.py
+
+# requires regtools installation
+
+# https://github.com/griffithlab/regtools
+# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc
+# Using regtools speeds up the junction extraction step by an order of magnitude or more
+
+import sys
+import tempfile
+import os
+import gzip
+import shutil
+
+def main(options,libl):
+
+    if options.cluster == None:
+        pool_junc_reads(libl, options)
+        refine_clusters(options)
+
+    sort_junctions(libl, options)
+    merge_junctions(options)
+    get_numers(options)
+
+def pool_junc_reads(flist, options):
+
+    outPrefix = options.outprefix
+    rundir = options.rundir
+    maxIntronLen = int(options.maxintronlen)
+    checkchrom = options.checkchrom
+
+    outFile = "%s/%s_pooled"%(rundir,outPrefix)
+
+    chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
+    by_chrom = {}
+    for libl in flist:
+
+        lib = libl.strip()
+        if not os.path.isfile(lib):
+            continue
+
+        if options.verbose:
+            sys.stderr.write("scanning %s...\n"%lib)
+
+        for ln in open(lib):
+
+            lnsplit=ln.split()
+            if len(lnsplit)<6:
+                sys.stderr.write("Error in %s \n" % lib)
+                continue
+            chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit
+            if int(blockCount) > 2:
+                print(ln)
+                continue
+
+            if checkchrom and (chrom not in chromLst): continue
+            Aoff, Boff = blockSize.split(",")
+            A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
+            if B-A > int(maxIntronLen): continue
+            try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)]
+            except:
+                try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
+                except: by_chrom[(chrom,strand)] = {(A,B):int(counts)}
+
+    fout = open(outFile, 'w')
+    Ncluster = 0
+    sys.stderr.write("Parsing...\n")
+    #print("HOW MANY CHROMOSOMES ARE THERE ANYWAY?")
+    #print(by_chrom)
+    #print("there are %d elements in by_chrom"%len(by_chrom))
+    for chrom in by_chrom:
+        read_ks = [k for k,v in list(by_chrom[chrom].items()) if v >= 3] # a junction must have at least 3 reads
+        read_ks.sort()
+        sys.stderr.write("%s:%s.."%chrom)
+        if len(read_ks) == 0:
+             continue # weird test case for toy data with only 1 gene - two chroms but one is empty after filtering
+        #print("LOOK HERE BOYO")
+        #print(read_ks)
+        clu = cluster_intervals(read_ks)[0]
+        for cl in clu:
+            if len(cl) > 1: # if cluster has more than one intron
+                buf = '%s:%s '%chrom
+                for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
+                    buf += "%d:%d" % interval + ":%d"%count+ " "
+                fout.write(buf+'\n')
+            Ncluster += 1
+    sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
+    fout.close()
+
+
+def sort_junctions(libl, options):
+
+    chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
+    outPrefix = options.outprefix
+    rundir = options.rundir
+    checkchrom = options.checkchrom
+
+    if options.cluster == None:
+        refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
+    else:
+        refined_cluster = options.cluster
+    runName = "%s/%s"%(rundir, outPrefix)
+
+
+    exons, cluExons = {}, {}
+    cluN = 0
+
+    for ln in open(refined_cluster):
+        chrom = ln.split()[0]
+        cluN += 1
+        for exon in ln.split()[1:]:
+            A, B, count = exon.split(":")
+            if chrom not in exons:
+                exons[chrom] = {}
+            exons[chrom][(int(A),int(B))] = cluN
+            if cluN not in cluExons:
+                cluExons[cluN] = []
+            cluExons[cluN].append((chrom, A, B))
+
+    merges = {}
+    for ll in libl:
+        lib=ll.rstrip()
+        if not os.path.isfile(lib):
+            continue
+        libN = lib
+        if libN not in merges:
+            merges[libN] = []
+        merges[libN].append(lib)
+
+    fout_runlibs = open(runName+"_sortedlibs",'w')
+
+    for libN in merges:
+        libName = "%s/%s"%(rundir,libN.split('/')[-1])
+        by_chrom = {}
+        foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
+
+        fout_runlibs.write(foutName+'\n')
+
+        if options.verbose:
+            sys.stderr.write("Sorting %s..\n"%libN)
+        if len(merges[libN]) > 1:
+            if options.verbose:
+                sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
+        else:
+            pass
+        
+        header_string = "chrom %s\n"%libN.split("/")[-1].split(".junc")[0]
+        
+        fout = gzip.open(foutName, 'wb')
+
+        fout.write(header_string.encode('utf-8') )
+	#fout = gzip.open(foutName,'wb')
+
+        #fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
+
+        for lib in merges[libN]:
+
+            for ln in open(lib):
+
+                lnsplit=ln.split()
+                if len(lnsplit)<6:
+                    sys.stderr.write("Error in %s \n" % lib)
+                    continue
+                chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit
+                if int(blockCount) > 2:
+                    print(ln)
+                    continue
+
+                if checkchrom and (chrom not in chromLst): continue
+                Aoff, Boff = blockSize.split(",")
+                A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
+                chrom = (chrom,strand)
+                if chrom not in by_chrom:
+                    by_chrom[chrom] = {}
+                intron = (A, B)
+
+                if intron in by_chrom[chrom]:
+                    by_chrom[chrom][intron] += int(count)
+                else:
+                    by_chrom[chrom][intron] = int(count)
+
+        for clu in cluExons:
+            buf = []
+            ks = cluExons[clu]
+            ks.sort()
+            tot = 0
+            for exon in ks:
+                chrom, start, end = exon
+                chrom = tuple(chrom.split(":"))
+                start, end = int(start), int(end)
+                if chrom not in by_chrom:
+                    pass
+                elif (start,end) in by_chrom[chrom]:
+                    tot += by_chrom[chrom][(start,end)]
+            for exon in ks:
+
+                chrom, start, end = exon
+                start, end = int(start), int(end)
+                chrom = tuple(chrom.split(":"))
+                chromID, strand = chrom
+                if chrom not in by_chrom:
+                    buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
+                elif (start,end) in by_chrom[chrom]:
+                    buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
+                else:
+                    buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
+
+            fout.write("".join(buf).encode('utf-8') )
+        fout.close()
+    fout_runlibs.close()
+
+def refine_clusters(options):
+
+    outPrefix = options.outprefix
+    rundir = options.rundir
+    minratio = float(options.mincluratio)
+    minreads = int(options.minclureads)
+
+    inFile = "%s/%s_pooled"%(rundir,outPrefix)
+    outFile = "%s/%s_refined"%(rundir,outPrefix)
+
+    fout = open(outFile,'w')
+    Ncl = 0
+    for ln in open(inFile):
+        clu = []
+        totN = 0
+        chrom = ln.split()[0]
+        for ex in ln.split()[1:]:
+            A, B, N = ex.split(":")
+            clu.append(((int(A),int(B)), int(N)))
+            totN += int(N)
+        if totN < minreads: continue
+        #print "CLU",clu
+        #print "linked",refine_linked(clu)
+        #print '\n\n'
+
+        for cl in refine_linked(clu):
+            rc = refine_cluster(cl,minratio, minreads)
+            if len(rc) > 0:
+                for clu in rc:
+                    buf = '%s ' % chrom
+                    for interval, count in clu:
+                        buf += "%d:%d" % interval + ":%d"%(count)+ " "
+                    Ncl += 1
+                    fout.write(buf+'\n')
+    sys.stderr.write("Split into %s clusters...\n"%Ncl)
+    fout.close()
+
+
+def merge_junctions(options):
+    ''' function to merge junctions '''
+
+    outPrefix = options.outprefix
+    rundir = options.rundir
+    fnameout = "%s/%s"%(rundir,outPrefix)
+    flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
+
+    lsts = []
+    for ln in open(flist):
+        lsts.append(ln.strip())
+    if options.verbose:
+        sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
+
+    # Change 300 if max open file is < 300
+    N = min([300, max([100, int(len(lsts)**(0.5))])])
+
+    tmpfiles = []
+    while len(lsts) > 1:
+        clst = []
+
+        for i in range(0,(len(lsts)//N)+1):
+            lst = lsts[N*i:N*(i+1)]
+            if len(lst) > 0:
+                clst.append(lst)
+        lsts = []
+
+        for lst in clst:
+            if len(lst) == 0: continue
+            tmpfile = tempfile.mktemp()
+            os.mkdir(tmpfile)
+            foutname = tmpfile+"/tmpmerge.gz"
+            fout = gzip.open(foutname,'w')
+
+            merge_files(lst, fout, options)
+            lsts.append(foutname)
+            tmpfiles.append(foutname)
+            fout.close()
+
+    shutil.move(lsts[0], fnameout+"_perind.counts.gz")
+
+def merge_files(fnames, fout, options):
+
+    fopen = []
+    for fname in fnames:
+        if fname[-3:] == ".gz":
+            fopen.append(gzip.open(fname, "rt")) # rt mode opens gzipped file as text file
+        else:
+            fopen.append(open(fname))
+
+    finished = False
+    N = 0
+    while not finished:
+        N += 1
+        if N % 50000 == 0:
+            sys.stderr.write(".")
+        buf = []
+        for f in fopen:
+            ln = f.readline().split()
+            if len(ln) == 0:
+                finished = True
+                break
+            chrom = ln[0]
+            data = ln[1:]
+            if len(buf) == 0:
+                buf.append(chrom)
+            buf += data
+        if len(buf) > 0:
+            if buf[0] == "chrom":
+                if options.verbose:
+                    sys.stderr.write("merging %d files"%(len(buf)-1))
+            
+            # problematic line - buf is stored as bytes rather than a string
+            out_string = " ".join(buf)+'\n'
+            fout.write(out_string.encode('utf-8') )
+        else:
+            break
+
+    sys.stderr.write(" done.\n")
+    for fin in fopen:
+        fin.close()
+
+
+def cluster_intervals(E):
+    ''' Clusters intervals together. '''
+    E.sort()
+    #print(len(E))
+    current = E[0]
+    Eclusters, cluster = [], []
+
+    i = 0
+    while i < len(E):
+
+        if overlaps(E[i], current):
+            cluster.append(E[i])
+        else:
+            Eclusters.append(cluster)
+            cluster = [E[i]]
+        current = (E[i][0], max([current[1], E[i][1]]))
+        i += 1
+
+    if len(cluster) > 0:
+
+        Eclusters.append(cluster)
+
+
+    return Eclusters, E
+
+def overlaps(A,B):
+    '''
+    Checks if A and B overlaps
+    '''
+
+    if A[1] < B[0] or B[1] < A[0]:
+        return False
+    else: return True
+
+def refine_linked(clusters):
+
+    unassigned = [x for x in clusters[1:]]
+    current = [clusters[0]]
+    splicesites = set([current[0][0][0],current[0][0][1]])
+    newClusters = []
+    while len(unassigned) > 0:
+        finished = False
+
+        while not finished:
+            finished = True
+            torm = []
+            for intron in unassigned:
+                inter, count = intron
+                start, end = inter
+                if start in splicesites or end in splicesites:
+                    current.append(intron)
+                    splicesites.add(start)
+                    splicesites.add(end)
+                    finished = False
+                    torm.append(intron)
+            for intron in torm:
+                unassigned.remove(intron)
+        newClusters.append(current)
+        current = []
+        if len(unassigned) > 0:
+            current = [unassigned[0]]
+            splicesites = set([current[0][0][0],current[0][0][1]])
+            unassigned = unassigned[1:]
+    return newClusters
+
+
+def refine_cluster(clu, cutoff, readcutoff):
+    ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
+    remove and recluster '''
+
+    remove = []
+    dic = {}
+    intervals = []
+
+    reCLU = False
+    totN = 0
+
+    for inter, count in clu:
+        totN += count
+    for inter, count in clu:
+        if (count/float(totN) >= cutoff and count >= readcutoff):
+            intervals.append(inter)
+            dic[inter] = count
+        else:
+            reCLU = True
+    if len(intervals) == 0: return []
+
+    # This makes sure that after trimming, the clusters are still good
+    Atmp, B = cluster_intervals(intervals)
+    A = []
+    for cl in Atmp:
+        for c in refine_linked([(x,0) for x in cl]):
+            if len(c) > 0:
+                A.append([x[0] for x in c])
+
+    if len(A) == 1:
+        rc = [(x, dic[x]) for x in A[0]]
+        if len(rc) > 1:
+            if reCLU:
+                return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
+            else:
+                return [[(x, dic[x]) for x in A[0]]]
+        else:
+            return []
+    NCs = []
+    for c in A:
+        if len(c) > 1:
+            NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
+            NCs += NC
+    return NCs
+
+
+def get_numers(options):
+    outPrefix = options.outprefix
+    rundir = options.rundir
+    fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
+    fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
+    input_file=gzip.open(fname, 'rt') # read in as text
+    fout = gzip.open(fnameout,'w')
+    first_line=True
+
+    for l in input_file:
+        if first_line:
+            header_string = " ".join(l.strip().split(" ")[1:])+'\n'
+            fout.write(header_string.encode('utf-8')) # print the sample names
+            first_line=False
+        else:
+            l=l.strip()
+            words=l.split(" ")
+            words_string = words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n'
+            fout.write(words_string.encode('utf-8'))
+
+    input_file.close()
+    fout.close()
+
+if __name__ == "__main__":
+
+    from optparse import OptionParser
+
+    parser = OptionParser()
+
+    parser.add_option("-j", "--juncfiles", dest="juncfiles",
+                  help="text file with all junction files to be processed")
+
+    parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
+                  help="output prefix (default leafcutter)")
+
+    parser.add_option("-q", "--quiet",
+                  action="store_false", dest="verbose", default=True,
+                  help="don't print status messages to stdout")
+
+    parser.add_option("-r", "--rundir", dest="rundir", default='./',
+                  help="write to directory (default ./)")
+
+    parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
+                  help="maximum intron length in bp (default 100,000bp)")
+
+    parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
+                  help="minimum reads in a cluster (default 30 reads)")
+
+    parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
+                  help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
+
+    parser.add_option("-c", "--cluster", dest="cluster", default = None,
+                  help="refined cluster file when clusters are already made")
+
+    parser.add_option("-k", "--checkchrom", dest="checkchrom", default = True,
+                  help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...")
+
+    (options, args) = parser.parse_args()
+
+    if options.juncfiles == None:
+        sys.stderr.write("Error: no junction file provided...\n")
+        exit(0)
+
+    # Get the junction file list
+    libl = []
+    for junc in open(options.juncfiles):
+        junc = junc.strip()
+        try:
+            open(junc)
+        except:
+            sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
+            exit(0)
+        libl.append(junc)
+
+    main(options, libl)
diff --git a/scripts/prepare_phenotype_table_py3.py b/scripts/prepare_phenotype_table_py3.py
new file mode 100644
index 0000000..c1a8d71
--- /dev/null
+++ b/scripts/prepare_phenotype_table_py3.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+# written by Yang Li
+# ported to python 3 by Jack Humphrey
+import sys
+import gzip
+import numpy as np
+import scipy as sc
+import pickle
+
+from optparse import OptionParser
+    
+from sklearn.decomposition import PCA
+from sklearn import preprocessing
+from sklearn import linear_model
+
+from scipy.stats import rankdata
+from scipy.stats import norm
+
+def qqnorm(x):
+    n=len(x)
+    a=3.0/8.0 if n<=10 else 0.5
+    return(norm.ppf( (rankdata(x)-a)/(n+1.0-2.0*a) ))
+
+def stream_table(f, ss = ''):
+    fc = '#'
+    while fc[0] == "#":
+        fc = f.readline().strip()
+        head = fc.split(ss)
+
+    for ln in f:
+        ln = ln.strip().split(ss)
+        attr = {}
+
+        for i in range(len(head)):
+            try: attr[head[i]] = ln[i]
+            except: break
+        yield attr
+
+def main(ratio_file, pcs=50):
+    
+    dic_pop, fout = {}, {}
+    try: open(ratio_file)
+    except: 
+        sys.stderr.write("Can't find %s..exiting\n"%(ratio_file))
+        return
+
+    sys.stderr.write("Starting...\n")
+    for i in range(1,23):
+        fout[i] = open(ratio_file+".phen_chr%d"%i,'w')
+        fout_ave = open(ratio_file+".ave",'w')
+    valRows, valRowsnn, geneRows = [], [], []
+    finished = False
+    # have to now specify text mode
+    header = gzip.open(ratio_file, mode = 'rt').readline().split()[1:]
+
+    # problematic line - something about py2->py3 and handling gzipped files
+    for i in fout:
+        header_string = "\t".join(["#Chr","start", "end", "ID"]+header)+'\n'
+        fout[i].write(header_string) 
+	#fout[i].write("\t".join(["#Chr","start", "end", "ID"]+header)+'\n')
+	
+    for dic in stream_table(gzip.open(ratio_file, mode = 'rt'),' '):
+
+        chrom = dic['chrom'].replace("chr",'')
+        chr_ = chrom.split(":")[0]
+        if chr_ in 'XY': continue
+        NA_indices, valRow, aveReads = [], [], []
+        tmpvalRow = []
+
+        i = 0
+        for sample in header:
+
+            try: count = dic[sample]
+            except: print(chrom, len(dic))
+            num, denom = count.split('/')
+            if float(denom) < 1:
+                count = "NA"
+                tmpvalRow.append("NA")
+                NA_indices.append(i)
+            else:
+                # add a 0.5 pseudocount
+                count = (float(num)+0.5)/((float(denom))+0.5)
+                tmpvalRow.append(count) 
+                aveReads.append(count)
+
+
+        # If ratio is missing for over 40% of the samples, skip
+        if tmpvalRow.count("NA") > len(tmpvalRow)*0.4:
+            continue
+
+        ave = np.mean(aveReads)
+
+        # Set missing values as the mean of all values
+        for c in tmpvalRow:
+            if c == "NA": valRow.append(ave)
+            else: valRow.append(c)
+
+        # If there is too little variation, skip (there is a bug in fastqtl which doesn't handle cases with no variation)
+        if np.std(valRow) < 0.005: continue
+
+        chr_, s, e, clu = chrom.split(":")
+        if len(valRow) > 0:                
+            chrom_int = int(chr_)
+            fout[chrom_int].write("\t".join([chr_,s,e,chrom]+[str(x) for x in valRow])+'\n')
+            fout_ave.write(" ".join(["%s"%chrom]+[str(min(aveReads)), str(max(aveReads)), str(np.mean(aveReads))])+'\n')
+
+            # scale normalize
+            valRowsnn.append(valRow)                
+            valRow = preprocessing.scale(valRow)
+
+            valRows.append(valRow)
+            geneRows.append("\t".join([chr_,s,e,chrom]))
+            if len(geneRows) % 1000 == 0:
+                sys.stderr.write("Parsed %s introns...\n"%len(geneRows))
+                
+    for i in fout:
+        fout[i].close()
+
+    # qqnorms on the columns
+    matrix = np.array(valRows)
+    for i in range(len(matrix[0,:])):
+        matrix[:,i] = qqnorm(matrix[:,i])
+        
+    # write the corrected tables
+    fout = {}
+    for i in range(1,23):
+        fn="%s.qqnorm_chr%d"%(ratio_file,i)
+        print(("Outputting: " + fn))
+        fout[i] = open(fn,'w')
+        fout[i].write("\t".join(['#Chr','start','end','ID'] + header)+'\n')
+    lst = []
+    for i in range(len(matrix)):
+        chrom, s = geneRows[i].split()[:2]
+        
+        lst.append((int(chrom.replace("chr","")), int(s), "\t".join([geneRows[i]] + [str(x) for x in  matrix[i]])+'\n'))
+
+    lst.sort()
+    for ln in lst:
+        fout[ln[0]].write(ln[2])
+        
+    fout_run = open("%s_prepare.sh"%ratio_file,'w')
+
+    for i in fout:
+        fout[i].close()
+        fout_run.write("bgzip -f %s.qqnorm_chr%d\n"%(ratio_file, i))
+        fout_run.write("tabix -p bed %s.qqnorm_chr%d.gz\n"%(ratio_file, i))
+    fout_run.close()
+
+    sys.stdout.write("Use `sh %s_prepare.sh' to create index for fastQTL (requires tabix and bgzip).\n"%ratio_file)
+
+    if pcs>0:
+        #matrix = np.transpose(matrix) # important bug fix (removed as of Jan 1 2018)
+        pcs = min([len(header), pcs])
+        pca = PCA(n_components=pcs)                                                                                                                                                                            
+        pca.fit(matrix)  
+        pca_fn=ratio_file+".PCs"
+        print(("Outputting PCs: " + pca_fn))
+        pcafile = open(pca_fn,'w')  
+        pcafile.write("\t".join(['id']+header)+'\n')
+        pcacomp = list(pca.components_)
+    
+        for i in range(len(pcacomp)):
+            pcafile.write("\t".join([str(i+1)]+[str(x) for x in pcacomp[i]])+'\n')
+
+        pcafile.close()
+
+if __name__ == "__main__":
+
+    parser = OptionParser(usage="usage: %prog [-p num_PCs] input_perind.counts.gz")
+    parser.add_option("-p", "--pcs", dest="npcs", default = 50, help="number of PCs output")
+    (options, args) = parser.parse_args()
+    if len(args)==0:
+        sys.stderr.write("Error: no ratio file provided... (e.g. python leafcutter/scripts/prepare_phenotype_table.py input_perind.counts.gz\n")
+        exit(0)
+    main(args[0], int(options.npcs) )
+