radQC/bin/samplesheet_gen.py at master · NationalGenomicsInfrastructure/radQC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Searches for fastq files in a folder and groups them by a prefix regex. Combining these files
into a valid samplesheet, and joining potential samples sequenced over multiple lanes. This script
is specifically tuned for the naming conventions of NGI's sequencing facility. Your mileage may vary.

Usage:
    python samplesheet_gen.py -p <prefix_regex> <in_folder> <out_samplesheet>
Input:
    prefix_regex - Regex to group files by (default: "^P\\d+_\\d+")
    in_folder - Folder containing fastq files
    out_samplesheet - Samplesheet file
Output:
    out_samplesheet - Samplesheet file in csv format with the following columns:
        sample, population, fastq_1, fastq_2
"""
import sys
import os
import argparse
import re
import csv
from itertools import chain

# Substrings looked for in a fastq file name to classify it as read 1 / read 2.
r1_suffixes = ['_R1_', '_1.']
r2_suffixes = ['_R2_', '_2.']
# Constant written to the "population" column of every samplesheet row.
pop_name = 'radQC'


def _has_read_marker(path, markers):
    """Return True if the file name (not the directories) of *path* contains any marker."""
    name = os.path.basename(path)
    return any(marker in name for marker in markers)


def main(in_folder, prefix_regex, out_samplesheet):
    """Write a samplesheet pairing the R1/R2 fastq files found under *in_folder*.

    Every ``.fastq.gz`` / ``.fq.gz`` file below *in_folder* (searched
    recursively, extension compared case-insensitively) whose file name
    matches *prefix_regex* and contains one of the read markers in
    ``r1_suffixes`` / ``r2_suffixes`` is grouped by the text the regex matched.
    Within each group the sorted R1 files are paired positionally with the
    sorted R2 files, and one CSV row ``sample, population, fastq_1, fastq_2``
    is written per pair. Groups whose R1 and R2 counts differ are reported on
    stderr and skipped.

    Args:
        in_folder: Root folder to search for fastq files.
        prefix_regex: Regular expression applied (with ``search``) to each
            file name; the matched text becomes the sample name.
        out_samplesheet: Path of the CSV file to write (overwritten if present).
    """
    # Sort the full paths so that both the per-sample lane order and the order
    # of samples in the output are independent of the order os.walk yields.
    fastq_files = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(in_folder)
        for name in names
        if name.lower().endswith(('.fastq.gz', '.fq.gz'))
    )

    # Compile once; the pattern does not change between files.
    prefix_re = re.compile(prefix_regex)
    prefix_groups = {}
    for path in fastq_files:
        match = prefix_re.search(os.path.basename(path))
        if match is None:
            continue
        # Only the file name is inspected: a directory such as "run_1.0" must
        # not make every file below it look like a read-1 file.
        if _has_read_marker(path, r1_suffixes) or _has_read_marker(path, r2_suffixes):
            prefix_groups.setdefault(match.group(0), []).append(path)

    n_files = sum(len(paths) for paths in prefix_groups.values())
    print(f'Found {n_files} files for prefix {prefix_regex}', file=sys.stderr)

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for file objects passed to csv.writer.
    with open(out_samplesheet, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(['sample', 'population', 'fastq_1', 'fastq_2'])
        for prefix, files in prefix_groups.items():
            # `files` is already sorted because `fastq_files` was sorted.
            r1_files = [f for f in files if _has_read_marker(f, r1_suffixes)]
            r2_files = [f for f in files if _has_read_marker(f, r2_suffixes)]
            # Positional pairing is only meaningful with equal counts.
            if len(r1_files) != len(r2_files):
                print(f'Warning: Mismatched R1 and R2 files for prefix {prefix}', file=sys.stderr)
                continue
            for r1, r2 in zip(r1_files, r2_files):
                writer.writerow([prefix, pop_name, r1, r2])


if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    # RawDescriptionHelpFormatter keeps the Usage/Input/Output layout of the
    # module docstring instead of reflowing it into a single paragraph.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # The help text derives the default from the argument itself so the two
    # cannot drift apart (the previous text omitted the leading "^").
    parser.add_argument(
        '--prefix_regex', '-p',
        default="^P\\d+_\\d+",
        help='Regex to group files by (default: "%(default)s")',
    )
    parser.add_argument('in_folder', help='Folder containing fastq files')
    parser.add_argument('out_samplesheet', help='Samplesheet file')
    args = parser.parse_args()
    main(args.in_folder, args.prefix_regex, args.out_samplesheet)