forked from AlexanderLabWHOI/eukrhythmic
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheukrhythmic
More file actions
157 lines (149 loc) · 9.71 KB
/
eukrhythmic
File metadata and controls
157 lines (149 loc) · 9.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Load the workflow settings from config.yaml in the working directory;
# Snakemake exposes them to this file and to included rule files as `config`.
configfile: "config.yaml"
import io
import os
import pathlib
import subprocess
import sys
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
from snakemake.exceptions import print_exception, WorkflowError
# Make the helper modules that live under scripts/ importable from here.
sys.path.insert(1, 'scripts')

# Import relevant variables from config; exit if not supplied.
# The star import places the module's public names directly in this
# Snakefile's namespace (presumably including OUTPUTDIR, ASSEMBLEDDIR,
# ASSEMBLERS, filenames, ... used by `rule all` -- verify against
# scripts/importworkspace.py).
import importworkspace
from importworkspace import *

# Besides being imported above, the module is also executed as a standalone
# script. Use the interpreter that is running Snakemake (sys.executable) and
# an argument list instead of a shell string, so the call does not depend on
# which `python` happens to be first on PATH and needs no shell quoting.
_importworkspace_run = subprocess.run(
    [sys.executable, os.path.join("scripts", "importworkspace.py")]
)
# The exit status used to be discarded silently; keep the run non-fatal
# (as before) but make a failure visible.
if _importworkspace_run.returncode != 0:
    print("Warning: " + os.path.join("scripts", "importworkspace.py")
          + " exited with status " + str(_importworkspace_run.returncode),
          file=sys.stderr)

# Contains function to check that variables are present and formatted
# correctly beyond provided values in config file.
import checkrequirements
from checkrequirements import *

## CHECK THAT ALL REQUIREMENTS ARE SATISFIED BY EXECUTING
## checkrequirementsfct() from `scripts/checkrequirements.py` ##
checkrequirementsfct()
# Pull in the rule definitions, one file per tool / pipeline step.
# Included files are evaluated in the order listed and share this
# Snakefile's namespace (imports and workspace variables defined above),
# so keep this list below the setup code and do not reorder it casually.
# The cd-hit and spike rule files are currently disabled (commented out).
include: "rules/fastqc-snake.smk"
include: "rules/bbmap-snake.smk"
include: "rules/trimmomatic-snake.smk"
include: "rules/fastqc-trimmed-snake.smk"
include: "rules/trinity-snake.smk"
include: "rules/velvet-snake.smk"
include: "rules/megahit-snake.smk"
include: "rules/transabyss-snake.smk"
include: "rules/transabyss-merge-snake.smk"
include: "rules/quast-snake.smk"
#include: "rules/cd-hit-snake.smk"
include: "rules/mmseqs-snake.smk"
include: "rules/manipnames-snake.smk"
include: "rules/transdecoder-snake.smk"
include: "rules/busco-snake.smk"
include: "rules/salmon-snake.smk"
include: "rules/spades-snake.smk"
include: "rules/annotate-snake.smk"
include: "rules/hardclean-snake.smk"
#include: "rules/spike-snake.smk"
include: "rules/eukulele-snake.smk"
# Resolve ambiguity between rules that can produce the same output file:
# in each declaration the rule on the left of `>` is preferred.
# The rules named here are not defined in this file; they presumably come
# from the rule files included above.
# Prefer the rules without the _SE suffix over their *_SE counterparts.
ruleorder: trimmomatic > trimmomatic_SE
ruleorder: trinity > trinity_SE
ruleorder: megahit > megahit_SE
ruleorder: velvet > velvet_SE
ruleorder: transdecoder_indiv > transdecoder_final_proteins > transdecoder_by_assembly
ruleorder: transdecoder_indiv_clean > transdecoder_finalproteins_clean > transdecoder_by_assembly_clean
ruleorder: salmon_indiv > salmon_clustering
# NOTE(review): the next declaration is contradicted further down by
# `salmon_clustering_raw > salmon_clustering`. Snakemake appears to let the
# later declaration win, which would make this line ineffective -- confirm
# the intended order and drop one of the two.
ruleorder: salmon_clustering > salmon_clustering_raw
ruleorder: combinequastmerge > quast_merged_transdecoded
#ruleorder: clustering_mega_merge > clustering_by_assembly_group
ruleorder: clustering_mega_merge_mmseqs > clustering_by_assembly_group_mmseqs
# NOTE(review): opposite of `salmon_clustering > salmon_clustering_raw` above.
ruleorder: salmon_clustering_raw > salmon_clustering
#ruleorder: merge_all > copies
#ruleorder: salmon_rename > salmon_clustering_against_mega
#ruleorder: salmon_rename > salmon_clustering
ruleorder: run_busco > busco
# Target rule collecting the final outputs of every pipeline stage. It is the
# first rule defined directly in this Snakefile, so it is what a bare
# `snakemake` invocation builds. It has no outputs or commands of its own;
# each named input below is a list of paths produced by the included rules.
#
# Every path is built with expand(): `{base}` is filled from OUTPUTDIR (or
# ASSEMBLEDDIR for the raw assemblies) and the remaining placeholders from
# the workspace variables (filenames, singleorpaired, assemblygroups,
# ASSEMBLERS, TDEXTENSIONS), which are not defined in this file.
rule all:
    input:
        # FASTQC OUTPUTS
        # NOTE(review): with `zip`, values are paired positionally. `base` is
        # a single directory while `sample`/`num` are presumably lists, so
        # depending on the Snakemake version the zip may stop after the first
        # sample -- confirm that every sample is actually requested here and
        # in the other zip-based targets below.
        fastqc1 = expand([os.path.join("{base}", "qc", "fastqc", "{sample}_{num}_fastqc.html"),
                          os.path.join("{base}", "qc", "fastqc", "{sample}_{num}_fastqc.zip")], zip,
                         base = OUTPUTDIR, sample = filenames, num = singleorpaired),
        # MULTIQC OUTPUTS
        multiqc1 = expand(os.path.join("{base}", "qc", "multiqc", "firstqcreport", "multiqc_report.html"), zip,
                          base = OUTPUTDIR),
        # BBMAP OUTPUTS
        bbmap = expand(os.path.join("{base}", "bbmap", "{sample}_{num}.clean.fastq.gz"), zip, base = OUTPUTDIR,
                       sample = filenames, num = singleorpaired),
        # TRIMMOMATIC OUTPUTS
        # NOTE(review): both the _1 and _2 files are requested for every
        # sample -- confirm this is intended for single-end data.
        trimmed = expand([os.path.join("{base}", "firsttrim", "{sample}_1.trimmed.fastq.gz"),
                          os.path.join("{base}", "firsttrim", "{sample}_2.trimmed.fastq.gz")], zip,
                         base = OUTPUTDIR, sample = filenames),
        # FASTQC 2 OUTPUTS (trimmed)
        fastqc2 = expand([os.path.join("{base}", "qc", "fastqc_trimmed", "{sample}_{num}.trimmed_fastqc.html"),
                          os.path.join("{base}", "qc", "fastqc_trimmed", "{sample}_{num}.trimmed_fastqc.zip")],
                         zip, base = OUTPUTDIR, sample = filenames, num = singleorpaired),
        # MULTIQC 2 OUTPUTS
        multiqc2 = expand(os.path.join("{base}", "qc", "multiqc", "trimmedqcreport", "multiqc_report.html"), zip,
                          base = OUTPUTDIR),
        # ASSEMBLER OUTPUTS
        assemblersout = expand(os.path.join("{base}", "{assembly}_{assembler}.fasta"),
                               base = ASSEMBLEDDIR, assembly = assemblygroups, assembler = ASSEMBLERS),
        # QUAST OUTPUTS
        quast = expand(os.path.join("{base}", "quast", "{assembly}"), base = OUTPUTDIR, assembly = assemblygroups),
        # COMBINE QUAST OUTPUTS
        quastcombine = expand(os.path.join("{base}", "quast", "fullresults", "allresults.tsv"), base = OUTPUTDIR),
        # TRANSDECODER OUTPUTS - optionally, run TransDecoder on the individual assemblies
        transdecoder_indiv = expand(os.path.join("{base}", "transdecoder_indiv",
                                                 "{assembly}_{assembler}.fasta.transdecoder.{tdextensions}"),
                                    base = OUTPUTDIR, assembly = assemblygroups, assembler = ASSEMBLERS,
                                    tdextensions = TDEXTENSIONS),
        # INDIVIDUAL CLUSTERING OUTPUTS
        # clustering_each_assembler = expand(os.path.join("{base}", "cluster1", "{assembly}_{assembler}.fasta"),
        #                                    base = OUTPUTDIR, assembly = assemblygroups, assembler = ASSEMBLERS),
        # MERGED CLUSTERING OUTPUTS - cluster on merged samples by assembly group and mega
        clustering_by_assembly_group = expand(os.path.join("{base}", "cluster_{folder}", "{assembly}_merged.fasta"),
                                              base = OUTPUTDIR, folder = "by_assembly_group",
                                              assembly = assemblygroups),
        clustering_mega_merge = expand(os.path.join("{base}", "cluster_{folder}", "{assembly}_merged.fasta"),
                                       base = OUTPUTDIR, folder = "mega_merge", assembly = "merged"),
        # SALMON QUANTIFICATION OF RAW AGAINST INDIVIDUAL ASSEMBLIES/ASSEMBLERS
        salmon_indiv = expand(os.path.join("{base}", "indiv_salmon", "salmon_quant_assembly_{assembly}_{assembler}",
                                           "quant.sf"),
                              base = OUTPUTDIR, assembly = assemblygroups, assembler = ASSEMBLERS),
        salmon_merged = expand(os.path.join("{base}", "merged_salmon", "salmon_quant_assembly_{assembly}", "quant.sf"),
                               base = OUTPUTDIR, assembly = assemblygroups),
        # SALMON QUANTIFICATION OF RAW AGAINST MERGED BY ASSEMBLY GROUP
        salmon_by_assembly = expand(os.path.join("{base}", "salmon_{folder}", "salmon_quant_assembly_{assembly}",
                                                 "quant.sf"),
                                    base = OUTPUTDIR, folder = "by_assembly_group", assembly = assemblygroups),
        # SALMON QUANTIFICATION OF ALL RAW FILES AGAINST MEGA-MERGED ASSEMBLY
        salmon_mega_merge = expand(os.path.join("{base}", "salmon_{folder}", "salmon_quant_assembly_{assembly}",
                                                "quant.sf"),
                                   base = OUTPUTDIR, folder = "mega_merge", assembly = "merged"),
        # SALMON QUANTIFICATION OF INDIVIDUAL RAW FILES FROM EACH AG AGAINST MEGA-MERGED ASSEMBLY
        salmon_mega_merge_raw = expand(os.path.join("{base}", "salmon_{folder}", "raw_individual",
                                                    "salmon_quant_assembly_{assembly}", "quant.sf"),
                                       base = OUTPUTDIR, folder = "mega_merge", assembly = assemblygroups),
        # TRANSDECODER ON FINAL AND MEGA-MERGED ASSEMBLY
        transdecoder_mega_merge = expand(os.path.join("{base}", "transdecoder_{folder}_finalproteins",
                                                      "{assembly}.fasta.transdecoder.{tdextensions}"),
                                         base = OUTPUTDIR, folder = "mega_merge", assembly = "merged",
                                         tdextensions = TDEXTENSIONS),
        # NOTE(review): the keyword below is spelled "trandecoder" (missing
        # "s"); left unchanged in case the name is referenced elsewhere.
        trandecoder_by_assembly_group = expand(os.path.join("{base}", "transdecoder_{folder}_finalproteins",
                                                            "{assembly}.fasta.transdecoder.{tdextensions}"),
                                               base = OUTPUTDIR, folder = "by_assembly_group",
                                               assembly = assemblygroups, tdextensions = TDEXTENSIONS),
        # QUAST QUALITY ASSESSMENT OF FINAL ASSEMBLY
        # NOTE(review): despite the names, `quastfinal` targets the
        # mega_merge folder and `quast_merged_mega_merge` targets the
        # by_assembly_group folder.
        quastfinal = expand(os.path.join("{base}", "quast_{folder}", "{assembly}"), base = OUTPUTDIR,
                            folder = "mega_merge", assembly = "merged"),
        quast_merged_mega_merge = expand(os.path.join("{base}", "quast_{folder}", "{assembly}"), base = OUTPUTDIR,
                                         folder = "by_assembly_group", assembly = assemblygroups),
        # COMBINE QUAST MERGED OUTPUTS FOR BY ASSEMBLY GROUP
        quastmergedcombine = expand(os.path.join("{base}", "quast_{folder}", "fullresults", "allresults.tsv"),
                                    base = OUTPUTDIR, folder = "by_assembly_group"),
        # BUSCO ASSESSMENT OF FINAL ASSEMBLY
        busco = expand(os.path.join("{base}", "busco", "{database}", "{folder}", "{assembly}"), base = OUTPUTDIR,
                       database = "eukaryota", folder = "mega_merge", assembly = "merged"), # eukaryota, bacteria
        # HMMER ALIGNMENT OF FINAL ASSEMBLY BEFORE MEGA-MERGE
        hmmer = expand(os.path.join("{base}", "pfam", "{folder}", "{assembly}.tblout"), base = OUTPUTDIR,
                       folder = "by_assembly_group", assembly = assemblygroups),
        # DIAMOND ALIGNMENT AND KEGG ANNOTATION
        kegg = expand(os.path.join("{base}", "kegg", "{folder}", "{assembly}_kegg.csv"), base = OUTPUTDIR,
                      folder = "by_assembly_group", assembly = assemblygroups),
        # QUANTIFICATION BASED ON SPIKE FILE
        #copies = expand(os.path.join(OUTPUTDIR, "salmon_{folder}", "copiesperL.tab"),
        #                folder = "by_assembly_group",
        #                base = OUTPUTDIR),
        # EUKULELE
        # Uses the "{base}" placeholder like every other target instead of
        # splicing OUTPUTDIR into the pattern text (where `base` was passed
        # but never used, and braces in the path would be read as wildcards).
        eukulele = expand(os.path.join("{base}", "eukulele_{folder}"),
                          base = OUTPUTDIR,
                          folder = "by_assembly_group")