From 1b445cb955c8552aa4cac8bc66cde772b0688e28 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 22 Dec 2020 04:31:39 +0000 Subject: [PATCH 1/4] Always try to read sample names from header --- deTiN/deTiN_utilities.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py index c724130..d2008c6 100644 --- a/deTiN/deTiN_utilities.py +++ b/deTiN/deTiN_utilities.py @@ -513,8 +513,17 @@ def read_indel_vcf(vcf,seg_table,indel_type): with open(vcf) as f: content = f.readlines() + # if the sample names were not present in the header (this VCF is technically + # malformed), then fallback to generic "normal" and "tumor" + normal_sample = "normal" + tumor_sample = "tumor" cols_type = {0: str} for line in content: + # if we can get the sample names from the header, use them instead + if line[0:15] == '##normal_sample': + normal_sample = line.split('=')[1][0:-1] + if line[0:14] == '##tumor_sample': + tumor_sample = line.split('=')[1][0:-1] if line[0] == '#' and line[1] != '#': headerline = line.split('\t') break @@ -533,13 +542,6 @@ def read_indel_vcf(vcf,seg_table,indel_type): elif indel_type.lower() == 'mutect2': indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type) # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR NORMAL - normal_sample = 'normal' - tumor_sample = 'tumor' - for line in content: - if line[0:15] == '##normal_sample': - normal_sample = line.split('=')[1][0:-1] - if line[0:14] == '##tumor_sample': - tumor_sample = line.split('=')[1][0:-1] if tumor_sample == 'tumor' and normal_sample == 'normal': indel_table.rename( columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format', From 2e1dd7d5b8f835a90046fd45b6238304145dc5dd Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 22 Dec 2020 04:48:02 +0000 Subject: [PATCH 2/4] rstrip header line; only need read_csv logic once --- deTiN/deTiN_utilities.py | 47 +++++++++++++++------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py index d2008c6..ce4f419 100644 --- a/deTiN/deTiN_utilities.py +++ b/deTiN/deTiN_utilities.py @@ -525,13 +525,25 @@ def read_indel_vcf(vcf,seg_table,indel_type): if line[0:14] == '##tumor_sample': tumor_sample = line.split('=')[1][0:-1] if line[0] == '#' and line[1] != '#': - headerline = line.split('\t') + headerline = line.rstrip().split('\t') break + indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type) + indel_table.rename(columns = { + 0: 'contig', + 1: 'position', + 2: 'ID', + 3: 'REF', + 4: 'ALT', + 5: 'QUAL', + 6: 'filter', + 7: 'INFO', + 8: 'format', + 9: headerline[9], + 10: headerline[10] + }, inplace = True) + if indel_type.lower() == 'strelka': - indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type) - indel_table.rename(columns={0: 'contig', 1: 'position',2:'ID',3:'REF',4:'ALT',5:'QUAL',7:'INFO', 8: 'format', 6: 'filter', 9: headerline[9].lower(), 10: headerline[10][0:-1].lower()}, - inplace=True) counts_format = indel_table['format'][0].split(':') depth_ix = counts_format.index('DP') alt_indel_ix = counts_format.index('TIR') @@ -540,37 +552,12 @@ def read_indel_vcf(vcf,seg_table,indel_type): indel_table.reset_index(inplace=True, drop=True) elif indel_type.lower() == 'mutect2': - indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type) - # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR NORMAL - if tumor_sample == 'tumor' and normal_sample == 'normal': - indel_table.rename( - columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format', - 6: 'filter', 9: 'tumor', 10: 'normal'}, - inplace=True) - else: - if tumor_sample == headerline[9]: - indel_table.rename( - columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format', - 6: 'filter', 9: 'tumor', 10: 'normal'}, - inplace=True) - elif tumor_sample == headerline[10][0:-1]: - indel_table.rename( - columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format', - 6: 'filter', 9: 'normal', 10: 'tumor'}, - inplace=True) - else: - print('failed to read MuTect 2 indels VCF') - sys.exit() counts_format = indel_table['format'][0].split(':') depth_ix = counts_format.index('AD') indel_table = indel_table[np.isfinite(is_member(indel_table['filter'], ['PASS', 'alt_allele_in_normal','artifact_in_normal']))] indel_table.reset_index(inplace=True, drop=True) elif indel_type.lower() == 'sanger': - indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type) - # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOUR - indel_table.rename(columns={0: 'contig', 1: 'position',2:'ID',3:'REF',4:'ALT',5:'QUAL',7:'INFO',8: 'format', 6: 'filter', 9: headerline[9].lower(), 10: headerline[10][0:-1].lower()}, - inplace=True) b1 = np.logical_or.reduce([indel_table['filter'] == 'F012', indel_table['filter'] == 'F012;F015']) b2 = np.logical_or.reduce([indel_table['filter'] == 'PASS', indel_table['filter'] == 'F015']) indel_table = indel_table[np.logical_or.reduce([b1, b2])] @@ -614,7 +601,7 @@ def read_indel_vcf(vcf,seg_table,indel_type): t_alt_count[index] = np.sum([int(spl_t[i]) for i in alt_count_idx]) t_ref_count[index] = t_depth[index] - t_alt_count[index] if len(indel_table) == 0: - indel_table = pd.DataFrame(index=[0],columns=['contig', 'position','ID','REF','ALT','QUAL','INFO','format', 'filter',headerline[9].lower(), headerline[10][0:-1].lower(), + indel_table = pd.DataFrame(index=[0],columns=['contig', 'position','ID','REF','ALT','QUAL','INFO','format', 'filter',headerline[9], headerline[10], 't_depth','t_alt_count','t_ref_count','n_alt_count','n_depth','n_ref_count','tau','f_acs','Chromosome','genomic_coord_x']) else: indel_table['t_depth'] = t_alt_count + t_ref_count From 5db7447d3f625f3362b5f6f1666c6e2c7f483e5e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 22 Dec 2020 04:48:21 +0000 Subject: [PATCH 3/4] Dynamically index into header line --- deTiN/deTiN_utilities.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py index ce4f419..7bfda11 100644 --- a/deTiN/deTiN_utilities.py +++ b/deTiN/deTiN_utilities.py @@ -543,6 +543,10 @@ def read_indel_vcf(vcf,seg_table,indel_type): 10: headerline[10] }, inplace = True) + # determine which columns corresponds to the tumor/normal + t_samp_ix = indel_table.columns.get_loc(tumor_sample) + n_samp_ix = indel_table.columns.get_loc(normal_sample) + if indel_type.lower() == 'strelka': counts_format = indel_table['format'][0].split(':') depth_ix = counts_format.index('DP') @@ -577,8 +581,8 @@ def read_indel_vcf(vcf,seg_table,indel_type): t_ref_count = np.zeros([len(indel_table), 1]) for index, row in indel_table.iterrows(): - spl_n = row['normal'].split(':') - spl_t = row['tumor'].split(':') + spl_n = row.iloc[n_samp_ix].split(':') + spl_t = row.iloc[t_samp_ix].split(':') if indel_type.lower() == 'strelka': n_depth[index] = int(spl_n[depth_ix]) n_alt_count[index] = int(spl_n[alt_indel_ix].split(',')[0]) From 8515c6175722bcf48ad8e14645a988b089f390b2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 22 Dec 2020 04:55:42 +0000 Subject: [PATCH 4/4] Make column inference a bit more robust --- deTiN/deTiN_utilities.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py index 7bfda11..2458c68 100644 --- a/deTiN/deTiN_utilities.py +++ b/deTiN/deTiN_utilities.py @@ -544,8 +544,23 @@ def read_indel_vcf(vcf,seg_table,indel_type): }, inplace = True) # determine which columns corresponds to the tumor/normal - t_samp_ix = indel_table.columns.get_loc(tumor_sample) - n_samp_ix = indel_table.columns.get_loc(normal_sample) + # if the sample names don't match the column names, we assume it's because + # we set it to generic "tumor"/"normal" (lowercase) but the column name is + # capitalized. if this assumption isn't satisfied, then we fail. + try: + t_samp_ix = indel_table.columns.get_loc(tumor_sample) + except KeyError: + if "TUMOR" in indel_table.columns: + t_samp_ix = indel_table.columns.get_loc("TUMOR") + else: + raise KeyError("Could not infer which VCF column corresponds to the tumor!") + try: + n_samp_ix = indel_table.columns.get_loc(normal_sample) + except KeyError: + if "NORMAL" in indel_table.columns: + n_samp_ix = indel_table.columns.get_loc("NORMAL") + else: + raise KeyError("Could not infer which VCF column corresponds to the normal!") if indel_type.lower() == 'strelka': counts_format = indel_table['format'][0].split(':')