-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSnakefile
More file actions
94 lines (83 loc) · 2.96 KB
/
Snakefile
File metadata and controls
94 lines (83 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
configfile: "config.yaml"
import io
import os
import glob
import pandas as pd
import numpy as np
import pathlib
from snakemake.exceptions import print_exception, WorkflowError
#----SET VARIABLES----#
PROTEIN_DIR = config['protein_dir']
ALIGNMENT_DIR = config['alignment_dir']
MAG_DIR = config['mag_dir']
OUTPUT_DIR = config['output_dir']
GENEFILE = config['gene_list']
genelist = []
with open(GENEFILE, 'r') as f:
for line in f:
genelist.append(line.strip())
SAMPLES = [os.path.basename(f).replace(".proteins.faa", "") for f in glob.glob(PROTEIN_DIR + "/*.proteins.faa")]
#----RULES----#
localrules: parse_hmmsearch
rule all:
input:
hmmbuild = expand('{base}/{gene}/gene_alignment_profile.hmm', base = ALIGNMENT_DIR, gene=genelist),
hmmsearch = expand('{base}/hmm_results/{gene}/{sample}.hmmout', base = OUTPUT_DIR, gene = genelist, sample = SAMPLES ),
hmmtable = expand('{base}/hmm_results/{gene}/{sample}.eval.tab', base = OUTPUT_DIR, gene = genelist, sample = SAMPLES ),
hmm_allhits = expand('{base}/hmm_results/{gene}/{sample}.hits.faa', base = OUTPUT_DIR, gene = genelist, sample = SAMPLES ),
hmm_mags = expand("{base}/mag_results/{gene}.maghits.contigs.csv", base = OUTPUT_DIR, gene = genelist)
rule hmmbuild:
input: alignment = ALIGNMENT_DIR + "/{gene}/gene_alignment.aln"
output: hmm = ALIGNMENT_DIR + "/{gene}/gene_alignment_profile.hmm"
conda:
"envs/hmmer.yaml"
shell:
"""
hmmbuild {output.hmm} {input.alignment}
"""
rule hmmsearch:
input:
proteins = PROTEIN_DIR + "/{sample}.proteins.faa",
hmm = ALIGNMENT_DIR + "/{gene}/gene_alignment_profile.hmm"
output:
hmmout = OUTPUT_DIR + "/hmm_results/{gene}/{sample}.hmmout",
params:
cpu = "--cpu 2"
conda:
"envs/hmmer.yaml"
shell:
"""
hmmsearch {params.cpu} {input.hmm} {input.proteins} > {output.hmmout}
"""
rule parse_hmmsearch:
input:
OUTPUT_DIR + "/hmm_results/{gene}/{sample}.hmmout"
output:
OUTPUT_DIR + "/hmm_results/{gene}/{sample}.eval.tab"
conda:
"envs/biopython.yaml"
script:
"scripts/bioconda-parser.py"
rule get_contig_hits:
input:
eval_tab = OUTPUT_DIR + "/hmm_results/{gene}/{sample}.eval.tab",
proteins = PROTEIN_DIR + "/{sample}.proteins.faa",
output:
OUTPUT_DIR + "/hmm_results/{gene}/{sample}.hits.faa"
conda:
"envs/seqtk.yaml"
shell:
"""
cut -f1 {input.eval_tab} | seqtk subseq {input.proteins} - > {output}
"""
rule get_MAG_hits:
input:
genelist = OUTPUT_DIR + "/curated-gene-lists/{gene}.threshold.csv",
magIDList = MAG_DIR + "/mag_fasta_headers.txt"
output:
maglist = OUTPUT_DIR + "/mag_results/{gene}.maghits.contigs.csv"
conda: "envs/biopython.yaml"
shell:
"""
python scripts/mag-finder.py {input.genelist} {input.magIDList} {output.maglist}
"""