Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,19 @@
.Trashes
Icon?
ehthumbs.db
Thumbs.db
Thumbs.db

# Python compiled files #
#########################
**__pycache__
**.pyc
.pytest_cache

# Eggs #
########
.eggs
svtyper.egg-info

# Idea #
########
.idea
198 changes: 132 additions & 66 deletions scripts/sv_classifier.py

Large diffs are not rendered by default.

108 changes: 69 additions & 39 deletions scripts/update_info.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/usr/bin/env python

import pysam
import argparse, sys
import math, time, re
import argparse
import sys
import math
import time
import re
from collections import Counter
from argparse import RawTextHelpFormatter

Expand All @@ -13,13 +16,15 @@
# --------------------------------------
# define functions


def get_args():
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
update_info.py\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: Update the PE SR fields after SVTyper")
parser.add_argument(metavar='vcf', dest='input_vcf', nargs='?', type=argparse.FileType('r'), default=None, help='VCF input (default: stdin)')
parser.add_argument(metavar='vcf', dest='input_vcf', nargs='?',
type=argparse.FileType('r'), default=None, help='VCF input (default: stdin)')

# parse the arguments
args = parser.parse_args()
Expand All @@ -34,7 +39,9 @@ def get_args():
# send back the user input
return args


class Vcf(object):

def __init__(self):
self.file_format = 'VCFv4.2'
# self.fasta = fasta
Expand All @@ -52,15 +59,15 @@ def add_header(self, header):
elif line.split('=')[0] == '##reference':
self.reference = line.rstrip().split('=')[1]
elif line.split('=')[0] == '##INFO':
a = line[line.find('<')+1:line.find('>')]
a = line[line.find('<') + 1:line.find('>')]
r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
self.add_info(*[b.split('=')[1] for b in r.findall(a)])
elif line.split('=')[0] == '##ALT':
a = line[line.find('<')+1:line.find('>')]
a = line[line.find('<') + 1:line.find('>')]
r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
self.add_alt(*[b.split('=')[1] for b in r.findall(a)])
elif line.split('=')[0] == '##FORMAT':
a = line[line.find('<')+1:line.find('>')]
a = line[line.find('<') + 1:line.find('>')]
r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
self.add_format(*[b.split('=')[1] for b in r.findall(a)])
elif line[0] == '#' and line[1] != '#':
Expand All @@ -71,10 +78,10 @@ def get_header(self, include_samples=True):
if include_samples:
header = '\n'.join(['##fileformat=' + self.file_format,
'##fileDate=' + time.strftime('%Y%m%d'),
'##reference=' + self.reference] + \
[i.hstring for i in self.info_list] + \
[a.hstring for a in self.alt_list] + \
[f.hstring for f in self.format_list] + \
'##reference=' + self.reference] +
[i.hstring for i in self.info_list] +
[a.hstring for a in self.alt_list] +
[f.hstring for f in self.format_list] +
['\t'.join([
'#CHROM',
'POS',
Expand All @@ -84,16 +91,16 @@ def get_header(self, include_samples=True):
'QUAL',
'FILTER',
'INFO',
'FORMAT'] + \
self.sample_list
)])
'FORMAT'] +
self.sample_list
)])
else:
header = '\n'.join(['##fileformat=' + self.file_format,
'##fileDate=' + time.strftime('%Y%m%d'),
'##reference=' + self.reference] + \
[i.hstring for i in self.info_list] + \
[a.hstring for a in self.alt_list] + \
[f.hstring for f in self.format_list] + \
'##reference=' + self.reference] +
[i.hstring for i in self.info_list] +
[a.hstring for a in self.alt_list] +
[f.hstring for f in self.format_list] +
['\t'.join([
'#CHROM',
'POS',
Expand All @@ -103,7 +110,7 @@ def get_header(self, include_samples=True):
'QUAL',
'FILTER',
'INFO']
)])
)])
return header

def add_info(self, id, number, type, desc):
Expand Down Expand Up @@ -136,6 +143,7 @@ def sample_to_col(self, sample):
return self.sample_list.index(sample) + 9

class Info(object):

def __init__(self, id, number, type, desc):
self.id = str(id)
self.number = str(number)
Expand All @@ -144,18 +152,22 @@ def __init__(self, id, number, type, desc):
# strip the double quotes around the string if present
if self.desc.startswith('"') and self.desc.endswith('"'):
self.desc = self.desc[1:-1]
self.hstring = '##INFO=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
self.hstring = '##INFO=<ID=' + self.id + ',Number=' + self.number + \
',Type=' + self.type + ',Description=\"' + self.desc + '\">'

class Alt(object):

def __init__(self, id, desc):
self.id = str(id)
self.desc = str(desc)
# strip the double quotes around the string if present
if self.desc.startswith('"') and self.desc.endswith('"'):
self.desc = self.desc[1:-1]
self.hstring = '##ALT=<ID=' + self.id + ',Description=\"' + self.desc + '\">'
self.hstring = '##ALT=<ID=' + self.id + \
',Description=\"' + self.desc + '\">'

class Format(object):

def __init__(self, id, number, type, desc):
self.id = str(id)
self.number = str(number)
Expand All @@ -164,9 +176,12 @@ def __init__(self, id, number, type, desc):
# strip the double quotes around the string if present
if self.desc.startswith('"') and self.desc.endswith('"'):
self.desc = self.desc[1:-1]
self.hstring = '##FORMAT=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
self.hstring = '##FORMAT=<ID=' + self.id + ',Number=' + self.number + \
',Type=' + self.type + ',Description=\"' + self.desc + '\">'


class Variant(object):

def __init__(self, var_list, vcf):
self.chrom = var_list[0]
self.pos = int(var_list[1])
Expand All @@ -184,10 +199,11 @@ def __init__(self, var_list, vcf):
self.format_list = vcf.format_list
self.active_formats = list()
self.gts = dict()

# fill in empty sample genotypes
if len(var_list) < 8:
sys.stderr.write('\nError: VCF file must have at least 8 columns\n')
sys.stderr.write(
'\nError: VCF file must have at least 8 columns\n')
exit(1)
if len(var_list) < 9:
var_list.append("GT")
Expand All @@ -204,7 +220,8 @@ def __init__(self, var_list, vcf):
self.gts[s] = Genotype(self, s, './.')

self.info = dict()
i_split = [a.split('=') for a in var_list[7].split(';')] # temp list of split info column
i_split = [a.split('=')
for a in var_list[7].split(';')] # temp list of split info column
for i in i_split:
if len(i) == 1:
i.append(True)
Expand All @@ -214,7 +231,8 @@ def set_info(self, field, value):
if field in [i.id for i in self.info_list]:
self.info[field] = value
else:
sys.stderr.write('\nError: invalid INFO field, \"' + field + '\"\n')
sys.stderr.write(
'\nError: invalid INFO field, \"' + field + '\"\n')
exit(1)

def get_info(self, field):
Expand All @@ -223,11 +241,12 @@ def get_info(self, field):
def get_info_string(self):
i_list = list()
for info_field in self.info_list:
if info_field.id in self.info.keys():
if info_field.id in list(self.info.keys()):
if info_field.type == 'Flag':
i_list.append(info_field.id)
else:
i_list.append('%s=%s' % (info_field.id, self.info[info_field.id]))
i_list.append(
'%s=%s' % (info_field.id, self.info[info_field.id]))
return ';'.join(i_list)

def get_format_string(self):
Expand All @@ -241,10 +260,11 @@ def genotype(self, sample_name):
if sample_name in self.sample_list:
return self.gts[sample_name]
else:
sys.stderr.write('\nError: invalid sample name, \"' + sample_name + '\"\n')
sys.stderr.write(
'\nError: invalid sample name, \"' + sample_name + '\"\n')

def get_var_string(self):
s = '\t'.join(map(str,[
s = '\t'.join(map(str, [
self.chrom,
self.pos,
self.var_id,
Expand All @@ -254,11 +274,14 @@ def get_var_string(self):
self.filter,
self.get_info_string(),
self.get_format_string(),
'\t'.join(self.genotype(s).get_gt_string() for s in self.sample_list)
'\t'.join(self.genotype(s).get_gt_string()
for s in self.sample_list)
]))
return s


class Genotype(object):

def __init__(self, variant, sample_name, gt):
self.format = dict()
self.variant = variant
Expand All @@ -270,9 +293,11 @@ def set_format(self, field, value):
if field not in self.variant.active_formats:
self.variant.active_formats.append(field)
# sort it to be in the same order as the format_list in header
self.variant.active_formats.sort(key=lambda x: [f.id for f in self.variant.format_list].index(x))
self.variant.active_formats.sort(
key=lambda x: [f.id for f in self.variant.format_list].index(x))
else:
sys.stderr.write('\nError: invalid FORMAT field, \"' + field + '\"\n')
sys.stderr.write(
'\nError: invalid FORMAT field, \"' + field + '\"\n')
exit(1)

def get_format(self, field):
Expand All @@ -288,28 +313,32 @@ def get_gt_string(self):
g_list.append(self.format[f])
else:
g_list.append('.')
return ':'.join(map(str,g_list))
return ':'.join(map(str, g_list))

# primary function


def update_info(vcf_file):
in_header = True
header = []
breakend_dict = {} # cache to hold unmatched generic breakends for genotyping
breakend_dict = {}
# cache to hold unmatched generic breakends for genotyping
vcf = Vcf()
vcf_out = sys.stdout

# read input VCF
for line in vcf_file:
if in_header:
if line[0] == '#':
header.append(line)
header.append(line)
if line[1] != '#':
vcf_samples = line.rstrip().split('\t')[9:]
continue
else:
in_header = False
vcf.add_header(header)
vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
vcf.add_info(
'MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
vcf.remove_info('SNAME')

# write the output header
Expand Down Expand Up @@ -350,12 +379,13 @@ def update_info(vcf_file):

vcf_out.write(var.get_var_string() + '\n')
vcf_out.close()

return

# --------------------------------------
# main function


def main():
# parse the command line args
args = get_args()
Expand All @@ -370,6 +400,6 @@ def main():
if __name__ == '__main__':
try:
sys.exit(main())
except IOError, e:
except IOError as e:
if e.errno != 32: # ignore SIGPIPE
raise
raise
Loading