-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsamplesheet_gen.py
More file actions
82 lines (72 loc) · 3.36 KB
/
samplesheet_gen.py
File metadata and controls
82 lines (72 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Searches for fastq files in a folder and groups them by a prefix regex, combining these files
into a valid samplesheet and joining potential samples sequenced over multiple lanes. This script
is specifically tuned for the naming conventions of NGI's sequencing facility. Your mileage may vary.
Usage:
python samplesheet_gen.py -p <prefix_regex> <in_folder> <out_samplesheet>
Input:
prefix_regex - Regex to group files by (default: "^P\\d+_\\d+")
in_folder - Folder containing fastq files
out_samplesheet - Samplesheet file
Output:
out_samplesheet - Samplesheet file in csv format with the following columns:
sample, population, fastq_1, fastq_2
"""
import sys
import os
import argparse
import re
import csv
from itertools import chain
# Substrings whose presence in a file name marks it as read 1 / read 2.
r1_suffixes = ['_R1_', '_1.']
r2_suffixes = ['_R2_', '_2.']
# Value written to the 'population' column of every samplesheet row.
pop_name = 'radQC'


def _has_marker(path, markers):
    """Return True if the file name (not the directories) of *path* contains any of *markers*."""
    file_name = os.path.basename(path)
    return any(marker in file_name for marker in markers)


def main(in_folder, prefix_regex, out_samplesheet):
    """Write a samplesheet pairing the R1/R2 fastq files found under a folder.

    Walks ``in_folder`` recursively, keeps files ending in ``.fastq.gz`` or
    ``.fq.gz`` (case-insensitive), and groups them by the text that
    ``prefix_regex`` matches in each file's basename. For every group the
    R1 and R2 files are paired in sorted order and written as one CSV row
    per pair with the columns ``sample, population, fastq_1, fastq_2``.

    Args:
        in_folder: Folder to search (including all subfolders).
        prefix_regex: Regular expression searched in each file's basename;
            the matched text becomes the sample name.
        out_samplesheet: Path of the CSV file to write (overwritten).

    Raises:
        re.error: If ``prefix_regex`` is not a valid regular expression.
        OSError: If ``out_samplesheet`` cannot be opened for writing.

    Groups whose R1 and R2 counts differ are skipped with a warning on
    stderr rather than aborting the whole run.
    """
    # Compile once up front: an invalid pattern fails immediately instead of
    # being re-compiled (and re-validated) for every file.
    prefix_re = re.compile(prefix_regex)

    # Get all fastq files in all subfolders
    fastq_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(in_folder)
        for name in names
        if name.lower().endswith(('.fastq.gz', '.fq.gz'))
    ]

    # Group files by prefix. Both the prefix and the read markers are looked
    # up in the basename only, so directory names (e.g. "run_1.0/") can never
    # be mistaken for a sample prefix or an R1/R2 marker.
    prefix_groups = {}
    for path in fastq_files:
        match = prefix_re.search(os.path.basename(path))
        if not match:
            continue
        if _has_marker(path, r1_suffixes) or _has_marker(path, r2_suffixes):
            prefix_groups.setdefault(match.group(0), []).append(path)

    n_files = sum(len(paths) for paths in prefix_groups.values())
    print(f'Found {n_files} files for prefix {prefix_regex}', file=sys.stderr)

    # Write samplesheet. newline='' lets the csv module control line endings
    # (otherwise rows end in \r\r\n on Windows).
    with open(out_samplesheet, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(['sample', 'population', 'fastq_1', 'fastq_2'])
        for prefix, files in prefix_groups.items():
            # Lexicographic sort on the full path; the R1 and R2 lists below
            # then line up lane by lane as long as paired files differ only
            # in their read marker.
            files.sort()
            r1_files = [f for f in files if _has_marker(f, r1_suffixes)]
            r2_files = [f for f in files if _has_marker(f, r2_suffixes)]
            # Ensure the same number of R1 and R2 files
            if len(r1_files) != len(r2_files):
                print(f'Warning: Mismatched R1 and R2 files for prefix {prefix}', file=sys.stderr)
                continue
            # Write paired files to samplesheet
            for r1, r2 in zip(r1_files, r2_files):
                writer.writerow([prefix, pop_name, r1, r2])
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is laid out in Usage/Input/Output sections;
        # the default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--prefix_regex', '-p',
        default="^P\\d+_\\d+",
        # %(default)s keeps the help text in sync with the actual default
        # (the hand-written copy had lost the leading '^' anchor).
        help='Regex to group files by (default: "%(default)s")',
    )
    parser.add_argument('in_folder', help='Folder containing fastq files')
    parser.add_argument('out_samplesheet', help='Samplesheet file')
    args = parser.parse_args()
    main(args.in_folder, args.prefix_regex, args.out_samplesheet)