hall-lab · florealcab · Apr 18, 2018 · Apr 18, 2018 · Apr 18, 2018 · Apr 20, 2018
diff --git a/.gitignore b/.gitignore
@@ -37,4 +37,19 @@
 .Trashes
 Icon?
 ehthumbs.db
-Thumbs.db
+Thumbs.db
+
+# Python compiled files #
+#########################
+**__pycache__
+**.pyc
+.pytest_cache
+
+# Eggs #
+########
+.eggs
+svtyper.egg-info
+
+# Idea #
+########
+.idea
diff --git a/scripts/sv_classifier.py b/scripts/sv_classifier.py
diff --git a/scripts/update_info.py b/scripts/update_info.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
 
 import pysam
-import argparse, sys
-import math, time, re
+import argparse
+import sys
+import math
+import time
+import re
 from collections import Counter
 from argparse import RawTextHelpFormatter
 
@@ -13,13 +16,15 @@
 # --------------------------------------
 # define functions
 
+
 def get_args():
     parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
 update_info.py\n\
 author: " + __author__ + "\n\
 version: " + __version__ + "\n\
 description: Update the PE SR fields after SVTyper")
-    parser.add_argument(metavar='vcf', dest='input_vcf', nargs='?', type=argparse.FileType('r'), default=None, help='VCF input (default: stdin)')
+    parser.add_argument(metavar='vcf', dest='input_vcf', nargs='?',
+                        type=argparse.FileType('r'), default=None, help='VCF input (default: stdin)')
 
     # parse the arguments
     args = parser.parse_args()
@@ -34,7 +39,9 @@ def get_args():
     # send back the user input
     return args
 
+
 class Vcf(object):
+
     def __init__(self):
         self.file_format = 'VCFv4.2'
         # self.fasta = fasta
@@ -52,15 +59,15 @@ def add_header(self, header):
             elif line.split('=')[0] == '##reference':
                 self.reference = line.rstrip().split('=')[1]
             elif line.split('=')[0] == '##INFO':
-                a = line[line.find('<')+1:line.find('>')]
+                a = line[line.find('<') + 1:line.find('>')]
                 r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
                 self.add_info(*[b.split('=')[1] for b in r.findall(a)])
             elif line.split('=')[0] == '##ALT':
-                a = line[line.find('<')+1:line.find('>')]
+                a = line[line.find('<') + 1:line.find('>')]
                 r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
                 self.add_alt(*[b.split('=')[1] for b in r.findall(a)])
             elif line.split('=')[0] == '##FORMAT':
-                a = line[line.find('<')+1:line.find('>')]
+                a = line[line.find('<') + 1:line.find('>')]
                 r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
                 self.add_format(*[b.split('=')[1] for b in r.findall(a)])
             elif line[0] == '#' and line[1] != '#':
@@ -71,10 +78,10 @@ def get_header(self, include_samples=True):
         if include_samples:
             header = '\n'.join(['##fileformat=' + self.file_format,
                                 '##fileDate=' + time.strftime('%Y%m%d'),
-                                '##reference=' + self.reference] + \
-                               [i.hstring for i in self.info_list] + \
-                               [a.hstring for a in self.alt_list] + \
-                               [f.hstring for f in self.format_list] + \
+                                '##reference=' + self.reference] +
+                               [i.hstring for i in self.info_list] +
+                               [a.hstring for a in self.alt_list] +
+                               [f.hstring for f in self.format_list] +
                                ['\t'.join([
                                    '#CHROM',
                                    'POS',
@@ -84,16 +91,16 @@ def get_header(self, include_samples=True):
                                    'QUAL',
                                    'FILTER',
                                    'INFO',
-                                   'FORMAT'] + \
-                                          self.sample_list
-                                      )])
+                                   'FORMAT'] +
+                                self.sample_list
+                                )])
         else:
             header = '\n'.join(['##fileformat=' + self.file_format,
                                 '##fileDate=' + time.strftime('%Y%m%d'),
-                                '##reference=' + self.reference] + \
-                               [i.hstring for i in self.info_list] + \
-                               [a.hstring for a in self.alt_list] + \
-                               [f.hstring for f in self.format_list] + \
+                                '##reference=' + self.reference] +
+                               [i.hstring for i in self.info_list] +
+                               [a.hstring for a in self.alt_list] +
+                               [f.hstring for f in self.format_list] +
                                ['\t'.join([
                                    '#CHROM',
                                    'POS',
@@ -103,7 +110,7 @@ def get_header(self, include_samples=True):
                                    'QUAL',
                                    'FILTER',
                                    'INFO']
-                                          )])
+                               )])
         return header
 
     def add_info(self, id, number, type, desc):
@@ -136,6 +143,7 @@ def sample_to_col(self, sample):
         return self.sample_list.index(sample) + 9
 
     class Info(object):
+
         def __init__(self, id, number, type, desc):
             self.id = str(id)
             self.number = str(number)
@@ -144,18 +152,22 @@ def __init__(self, id, number, type, desc):
             # strip the double quotes around the string if present
             if self.desc.startswith('"') and self.desc.endswith('"'):
                 self.desc = self.desc[1:-1]
-            self.hstring = '##INFO=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
+            self.hstring = '##INFO=<ID=' + self.id + ',Number=' + self.number + \
+                ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
 
     class Alt(object):
+
         def __init__(self, id, desc):
             self.id = str(id)
             self.desc = str(desc)
             # strip the double quotes around the string if present
             if self.desc.startswith('"') and self.desc.endswith('"'):
                 self.desc = self.desc[1:-1]
-            self.hstring = '##ALT=<ID=' + self.id + ',Description=\"' + self.desc + '\">'
+            self.hstring = '##ALT=<ID=' + self.id + \
+                ',Description=\"' + self.desc + '\">'
 
     class Format(object):
+
         def __init__(self, id, number, type, desc):
             self.id = str(id)
             self.number = str(number)
@@ -164,9 +176,12 @@ def __init__(self, id, number, type, desc):
             # strip the double quotes around the string if present
             if self.desc.startswith('"') and self.desc.endswith('"'):
                 self.desc = self.desc[1:-1]
-            self.hstring = '##FORMAT=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
+            self.hstring = '##FORMAT=<ID=' + self.id + ',Number=' + self.number + \
+                ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
+
 
 class Variant(object):
+
     def __init__(self, var_list, vcf):
         self.chrom = var_list[0]
         self.pos = int(var_list[1])
@@ -184,10 +199,11 @@ def __init__(self, var_list, vcf):
         self.format_list = vcf.format_list
         self.active_formats = list()
         self.gts = dict()
-        
+
         # fill in empty sample genotypes
         if len(var_list) < 8:
-            sys.stderr.write('\nError: VCF file must have at least 8 columns\n')
+            sys.stderr.write(
+                '\nError: VCF file must have at least 8 columns\n')
             exit(1)
         if len(var_list) < 9:
             var_list.append("GT")
@@ -204,7 +220,8 @@ def __init__(self, var_list, vcf):
                 self.gts[s] = Genotype(self, s, './.')
 
         self.info = dict()
-        i_split = [a.split('=') for a in var_list[7].split(';')] # temp list of split info column
+        i_split = [a.split('=')
+                   for a in var_list[7].split(';')]  # temp list of split info column
         for i in i_split:
             if len(i) == 1:
                 i.append(True)
@@ -214,7 +231,8 @@ def set_info(self, field, value):
         if field in [i.id for i in self.info_list]:
             self.info[field] = value
         else:
-            sys.stderr.write('\nError: invalid INFO field, \"' + field + '\"\n')
+            sys.stderr.write(
+                '\nError: invalid INFO field, \"' + field + '\"\n')
             exit(1)
 
     def get_info(self, field):
@@ -223,11 +241,12 @@ def get_info(self, field):
     def get_info_string(self):
         i_list = list()
         for info_field in self.info_list:
-            if info_field.id in self.info.keys():
+            if info_field.id in list(self.info.keys()):
                 if info_field.type == 'Flag':
                     i_list.append(info_field.id)
                 else:
-                    i_list.append('%s=%s' % (info_field.id, self.info[info_field.id]))
+                    i_list.append(
+                        '%s=%s' % (info_field.id, self.info[info_field.id]))
         return ';'.join(i_list)
 
     def get_format_string(self):
@@ -241,10 +260,11 @@ def genotype(self, sample_name):
         if sample_name in self.sample_list:
             return self.gts[sample_name]
         else:
-            sys.stderr.write('\nError: invalid sample name, \"' + sample_name + '\"\n')
+            sys.stderr.write(
+                '\nError: invalid sample name, \"' + sample_name + '\"\n')
 
     def get_var_string(self):
-        s = '\t'.join(map(str,[
+        s = '\t'.join(map(str, [
             self.chrom,
             self.pos,
             self.var_id,
@@ -254,11 +274,14 @@ def get_var_string(self):
             self.filter,
             self.get_info_string(),
             self.get_format_string(),
-            '\t'.join(self.genotype(s).get_gt_string() for s in self.sample_list)
+            '\t'.join(self.genotype(s).get_gt_string()
+                      for s in self.sample_list)
         ]))
         return s
 
+
 class Genotype(object):
+
     def __init__(self, variant, sample_name, gt):
         self.format = dict()
         self.variant = variant
@@ -270,9 +293,11 @@ def set_format(self, field, value):
             if field not in self.variant.active_formats:
                 self.variant.active_formats.append(field)
                 # sort it to be in the same order as the format_list in header
-                self.variant.active_formats.sort(key=lambda x: [f.id for f in self.variant.format_list].index(x))
+                self.variant.active_formats.sort(
+                    key=lambda x: [f.id for f in self.variant.format_list].index(x))
         else:
-            sys.stderr.write('\nError: invalid FORMAT field, \"' + field + '\"\n')
+            sys.stderr.write(
+                '\nError: invalid FORMAT field, \"' + field + '\"\n')
             exit(1)
 
     def get_format(self, field):
@@ -288,28 +313,32 @@ def get_gt_string(self):
                     g_list.append(self.format[f])
             else:
                 g_list.append('.')
-        return ':'.join(map(str,g_list))
+        return ':'.join(map(str, g_list))
 
 # primary function
+
+
 def update_info(vcf_file):
     in_header = True
     header = []
-    breakend_dict = {} # cache to hold unmatched generic breakends for genotyping
+    breakend_dict = {}
+        # cache to hold unmatched generic breakends for genotyping
     vcf = Vcf()
     vcf_out = sys.stdout
 
     # read input VCF
     for line in vcf_file:
         if in_header:
             if line[0] == '#':
-                header.append(line) 
+                header.append(line)
                 if line[1] != '#':
                     vcf_samples = line.rstrip().split('\t')[9:]
                 continue
             else:
                 in_header = False
                 vcf.add_header(header)
-                vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
+                vcf.add_info(
+                    'MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
                 vcf.remove_info('SNAME')
 
                 # write the output header
@@ -350,12 +379,13 @@ def update_info(vcf_file):
 
         vcf_out.write(var.get_var_string() + '\n')
     vcf_out.close()
-    
+
     return
 
 # --------------------------------------
 # main function
 
+
 def main():
     # parse the command line args
     args = get_args()
@@ -370,6 +400,6 @@ def main():
 if __name__ == '__main__':
     try:
         sys.exit(main())
-    except IOError, e:
+    except IOError as e:
         if e.errno != 32:  # ignore SIGPIPE
-            raise 
+            raise