From 1b445cb955c8552aa4cac8bc66cde772b0688e28 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 22 Dec 2020 04:31:39 +0000
Subject: [PATCH 1/4] Always try to read sample names from header

---
 deTiN/deTiN_utilities.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py
index c724130..d2008c6 100644
--- a/deTiN/deTiN_utilities.py
+++ b/deTiN/deTiN_utilities.py
@@ -513,8 +513,17 @@ def read_indel_vcf(vcf,seg_table,indel_type):
         with open(vcf) as f:
             content = f.readlines()
 
+    # if the sample names were not present in the header (this VCF is technically
+    # malformed), then fall back to generic "normal" and "tumor"
+    normal_sample = "normal"
+    tumor_sample = "tumor"
     cols_type = {0: str}
     for line in content:
+        # if we can get the sample names from the header, use them instead
+        if line[0:15] == '##normal_sample':
+            normal_sample = line.split('=')[1][0:-1]
+        if line[0:14] == '##tumor_sample':
+            tumor_sample = line.split('=')[1][0:-1]
         if line[0] == '#' and line[1] != '#':
             headerline = line.split('\t')
             break
@@ -533,13 +542,6 @@ def read_indel_vcf(vcf,seg_table,indel_type):
     elif indel_type.lower() == 'mutect2':
         indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
         # CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	TUMOR	NORMAL
-        normal_sample = 'normal'
-        tumor_sample = 'tumor'
-        for line in content:
-            if line[0:15] == '##normal_sample':
-                normal_sample = line.split('=')[1][0:-1]
-            if line[0:14] == '##tumor_sample':
-                tumor_sample = line.split('=')[1][0:-1]
         if tumor_sample == 'tumor' and normal_sample == 'normal':
             indel_table.rename(
                 columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',

From 2e1dd7d5b8f835a90046fd45b6238304145dc5dd Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 22 Dec 2020 04:48:02 +0000
Subject: [PATCH 2/4] rstrip header line; only need read_csv logic once

---
 deTiN/deTiN_utilities.py | 47 +++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py
index d2008c6..ce4f419 100644
--- a/deTiN/deTiN_utilities.py
+++ b/deTiN/deTiN_utilities.py
@@ -525,13 +525,25 @@ def read_indel_vcf(vcf,seg_table,indel_type):
         if line[0:14] == '##tumor_sample':
             tumor_sample = line.split('=')[1][0:-1]
         if line[0] == '#' and line[1] != '#':
-            headerline = line.split('\t')
+            headerline = line.rstrip().split('\t')
             break
 
+    indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
+    indel_table.rename(columns = {
+      0: 'contig',
+      1: 'position',
+      2: 'ID',
+      3: 'REF',
+      4: 'ALT',
+      5: 'QUAL',
+      6: 'filter',
+      7: 'INFO',
+      8: 'format',
+      9: headerline[9],
+     10: headerline[10]
+    }, inplace = True)
+
     if indel_type.lower() == 'strelka':
-        indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
-        indel_table.rename(columns={0: 'contig', 1: 'position',2:'ID',3:'REF',4:'ALT',5:'QUAL',7:'INFO', 8: 'format', 6: 'filter', 9: headerline[9].lower(), 10: headerline[10][0:-1].lower()},
-                       inplace=True)
         counts_format = indel_table['format'][0].split(':')
         depth_ix = counts_format.index('DP')
         alt_indel_ix = counts_format.index('TIR')
@@ -540,37 +552,12 @@ def read_indel_vcf(vcf,seg_table,indel_type):
         indel_table.reset_index(inplace=True, drop=True)
 
     elif indel_type.lower() == 'mutect2':
-        indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
-        # CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	TUMOR	NORMAL
-        if tumor_sample == 'tumor' and normal_sample == 'normal':
-            indel_table.rename(
-                columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',
-                         6: 'filter', 9: 'tumor', 10: 'normal'},
-                inplace=True)
-        else:
-            if tumor_sample == headerline[9]:
-                indel_table.rename(
-                        columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',
-                         6: 'filter', 9: 'tumor', 10: 'normal'},
-                        inplace=True)
-            elif tumor_sample == headerline[10][0:-1]:
-                indel_table.rename(
-                    columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',
-                             6: 'filter', 9: 'normal', 10: 'tumor'},
-                    inplace=True)
-            else:
-                print('failed to read MuTect 2 indels VCF')
-                sys.exit()
         counts_format = indel_table['format'][0].split(':')
         depth_ix = counts_format.index('AD')
         indel_table = indel_table[np.isfinite(is_member(indel_table['filter'], ['PASS', 'alt_allele_in_normal','artifact_in_normal']))]
         indel_table.reset_index(inplace=True, drop=True)
 
     elif indel_type.lower() == 'sanger':
-        indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
-        # CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NORMAL  TUMOUR
-        indel_table.rename(columns={0: 'contig', 1: 'position',2:'ID',3:'REF',4:'ALT',5:'QUAL',7:'INFO',8: 'format', 6: 'filter', 9: headerline[9].lower(), 10: headerline[10][0:-1].lower()},
-                           inplace=True)
         b1 = np.logical_or.reduce([indel_table['filter'] == 'F012', indel_table['filter'] == 'F012;F015'])
         b2 = np.logical_or.reduce([indel_table['filter'] == 'PASS', indel_table['filter'] == 'F015'])
         indel_table = indel_table[np.logical_or.reduce([b1, b2])]
@@ -614,7 +601,7 @@ def read_indel_vcf(vcf,seg_table,indel_type):
             t_alt_count[index] = np.sum([int(spl_t[i]) for i in alt_count_idx])
             t_ref_count[index] = t_depth[index] - t_alt_count[index]
     if len(indel_table) == 0:
-        indel_table = pd.DataFrame(index=[0],columns=['contig', 'position','ID','REF','ALT','QUAL','INFO','format', 'filter',headerline[9].lower(), headerline[10][0:-1].lower(),
+        indel_table = pd.DataFrame(index=[0],columns=['contig', 'position','ID','REF','ALT','QUAL','INFO','format', 'filter',headerline[9], headerline[10],
                                                       't_depth','t_alt_count','t_ref_count','n_alt_count','n_depth','n_ref_count','tau','f_acs','Chromosome','genomic_coord_x'])
     else:
         indel_table['t_depth'] = t_alt_count + t_ref_count

From 5db7447d3f625f3362b5f6f1666c6e2c7f483e5e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 22 Dec 2020 04:48:21 +0000
Subject: [PATCH 3/4] Dynamically index into header line

---
 deTiN/deTiN_utilities.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py
index ce4f419..7bfda11 100644
--- a/deTiN/deTiN_utilities.py
+++ b/deTiN/deTiN_utilities.py
@@ -543,6 +543,10 @@ def read_indel_vcf(vcf,seg_table,indel_type):
      10: headerline[10]
     }, inplace = True)
 
+    # determine which columns corresponds to the tumor/normal
+    t_samp_ix = indel_table.columns.get_loc(tumor_sample)
+    n_samp_ix = indel_table.columns.get_loc(normal_sample)
+
     if indel_type.lower() == 'strelka':
         counts_format = indel_table['format'][0].split(':')
         depth_ix = counts_format.index('DP')
@@ -577,8 +581,8 @@ def read_indel_vcf(vcf,seg_table,indel_type):
     t_ref_count = np.zeros([len(indel_table), 1])
 
     for index, row in indel_table.iterrows():
-        spl_n = row['normal'].split(':')
-        spl_t = row['tumor'].split(':')
+        spl_n = row.iloc[n_samp_ix].split(':')
+        spl_t = row.iloc[t_samp_ix].split(':')
         if indel_type.lower() == 'strelka':
             n_depth[index] = int(spl_n[depth_ix])
             n_alt_count[index] = int(spl_n[alt_indel_ix].split(',')[0])

From 8515c6175722bcf48ad8e14645a988b089f390b2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 22 Dec 2020 04:55:42 +0000
Subject: [PATCH 4/4] Make column inference a bit more robust

---
 deTiN/deTiN_utilities.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/deTiN/deTiN_utilities.py b/deTiN/deTiN_utilities.py
index 7bfda11..2458c68 100644
--- a/deTiN/deTiN_utilities.py
+++ b/deTiN/deTiN_utilities.py
@@ -544,8 +544,23 @@ def read_indel_vcf(vcf,seg_table,indel_type):
     }, inplace = True)
 
     # determine which columns corresponds to the tumor/normal
-    t_samp_ix = indel_table.columns.get_loc(tumor_sample)
-    n_samp_ix = indel_table.columns.get_loc(normal_sample)
+    # if a sample name doesn't match any column name, we assume it's because
+    # we set it to generic "tumor"/"normal" (lowercase) but the column name is
+    # uppercase ("TUMOR"/"NORMAL"). if that isn't the case either, raise KeyError.
+    try:
+        t_samp_ix = indel_table.columns.get_loc(tumor_sample)
+    except KeyError:
+        if "TUMOR" in indel_table.columns:
+            t_samp_ix = indel_table.columns.get_loc("TUMOR")
+        else:
+            raise KeyError("Could not infer which VCF column corresponds to the tumor!")
+    try:
+        n_samp_ix = indel_table.columns.get_loc(normal_sample)
+    except KeyError:
+        if "NORMAL" in indel_table.columns:
+            n_samp_ix = indel_table.columns.get_loc("NORMAL")
+        else:
+            raise KeyError("Could not infer which VCF column corresponds to the normal!")
 
     if indel_type.lower() == 'strelka':
         counts_format = indel_table['format'][0].split(':')