indrops/Snakefile at master · adrdufour/indrops · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Run every shell command through bash; the rule bodies below use bash-only
# syntax for their worker loops.
shell.executable("/bin/bash")
import itertools

# Workflow-wide conda environment and the indrops project description, which
# is exposed to the rules through the `config` dictionary.
conda: "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
configfile: "/home/adrien.dufour/NeuroDev_ADD/SingleCell/indrops-master/project.yaml"
#shell.prefix("conda activate indrops")

# Path passed to indrops.py as its project file (same file as `configfile`).
YAMLPATH = "/home/adrien.dufour/NeuroDev_ADD/SingleCell/indrops-master/project.yaml"
# Never populated in the visible part of this file, so every list derived
# from FASTQ (FastQC targets, filter_reads input) is empty.
FASTQ = []
# Library names collected from config['sequencing_runs'] by the loop below.
LIBRARY = []
# Worker indices 0 .. cores.default - 1.
WORKERS = range(config['cores']['default'])
# WORKER and READS are not referenced in the visible part of this file.
WORKER = ['1', '2', '3']
READS = ['R1', 'R2']
# Placeholders; both names are rebound by the loop below.
SPLIT = []
RUN = []

# NOTE(review): RUN and SPLIT are reassigned on every iteration, so after the
# loop they hold the values of the LAST sequencing run only, while LIBRARY
# accumulates one entry per run (duplicates included if runs share a library).
# This looks correct only for a project with a single sequencing run -- confirm
# before using it with several. `dir_lib` is not used in the visible part of
# this file.
for each in config['sequencing_runs']:
    RUN = each['name']
    dir_lib = each['dir']
    SPLIT = each['split_affixes']
    LIBRARY.append(each['library_name'])

def aggregate_input(wildcards):
    """Return the per-worker count tables of one library's quantification.

    The paths have the form
    ``<project_dir>/<library>/quant_dir/worker<i>_<N>.counts.tsv`` where ``N``
    is ``config['cores']['quantify_barcodes']`` and ``i`` runs over
    ``0 .. N-1``, i.e. exactly the files rule ``quantify_barcodes`` declares.

    The worker indices are derived from the same setting that is embedded in
    the file names. Iterating over ``cores.default`` instead would yield too
    few or too many paths whenever the two settings differ.

    Args:
        wildcards: Snakemake wildcards object; only ``wildcards.library`` is
            read.

    Returns:
        list[str]: one path per quantification worker, ordered by index.
    """
    total_workers = config['cores']['quantify_barcodes']
    quant_dir = os.path.join(config['project_dir'], wildcards.library, "quant_dir")
    return [
        os.path.join(quant_dir, "worker{i}_{n}.counts.tsv".format(i=i, n=total_workers))
        for i in range(total_workers)
    ]

# Default target: lists every file the pipeline is expected to produce.
# Each expand() call is given only the wildcards its pattern contains.
rule all:
    input:
        # FastQC reports, one per FASTQ entry whose name contains 'R1'.
        [os.path.join(config['project_dir'], 'fastqc', x.replace('.fastq', '_fastqc.html')) for x in FASTQ if 'R1' in x],
        # Outputs of filter_reads.
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq"), split=SPLIT, library=LIBRARY, run=RUN),
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.counts.pickle"), split=SPLIT, library=LIBRARY, run=RUN),
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}metrics.yaml"), split=SPLIT, library=LIBRARY, run=RUN),
        # Outputs of abundant_barcodes (per library only).
        expand(os.path.join(config['project_dir'], "{library}", "abundant_barcodes.pickle"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance_by_barcode.png"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance.png"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.filtering_stats.csv"), library=LIBRARY),
        # Outputs of sort_reads.
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz"), split=SPLIT, run=RUN, library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz.index.pickle"), split=SPLIT, run=RUN, library=LIBRARY),
        # Outputs of quantify_barcodes: one set of files per worker index.
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".metrics.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.partners"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY),
        # Outputs of aggregate_umis.
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam.bai"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.counts.tsv.gz"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.quant_metrics.tsv.gz"), library=LIBRARY)

# Run FastQC on the <split>_R1.fastq file of every split affix and write the
# reports into <project_dir>/fastqc.
rule fastqc_biological_reads:
    input:
        # Absolute path, like every other /home/adrien.dufour/... location in
        # this file; without the leading slash it would be resolved relative
        # to the working directory.
        expand(os.path.join("/home/adrien.dufour/PROTECT/debug_data/", "{split}_R1.fastq"), split=SPLIT)
    params:
        outdir=os.path.join(config['project_dir'], 'fastqc')
    output:
        # Derived from FASTQ; one HTML report per entry whose name contains 'R1'.
        [os.path.join(config['project_dir'], 'fastqc', x.replace('.fastq', '_fastqc.html')) for x in FASTQ if 'R1' in x]
    shell:
        "fastqc {input} -o {params.outdir}"

# Filter the raw reads with `indrops.py filter`, running the worker shares one
# after another inside a single job.
#
# The resulting workload (a list of run parts) is split among N --total-workers,
# where the worker with --worker-index i does steps (i, N+i, 2N+i, ...).
# Valid indices are therefore 0 .. N-1. The loop stops before N because bash's
# inclusive {0..N} range would also launch index N, which repeats worker 0's
# share.
rule filter_reads:
    input:
        fastq=FASTQ,
        yaml=YAMLPATH
    output:
        os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq"),
        os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.counts.pickle"),
        os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}metrics.yaml"),
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    params:
        # N: total number of workers the filtering workload is split into.
        workers=config['cores']['default']
    # TODO: no log file is declared yet. Path sketched by the original author:
    # "logs/{wildcards.run}_{wildcards.library}_{params.worker}_filter.log"
    shell:
        """
        for ((i = 0; i < {params.workers}; i++)); do
            python indrops.py {input.yaml} filter --runs {RUN} --libraries {LIBRARY} --total-workers {params.workers} --worker-index $i
        done;
        """

# Run `indrops.py identify_abundant_barcodes` once the barcode count pickles of
# every library/run/split combination exist. Declares the abundant-barcode
# pickle, the two abundance plots and the filtering statistics of a library as
# outputs.
rule abundant_barcodes:
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    input:
        # Per-part barcode counts written by filter_reads.
        expand(
            os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.counts.pickle"),
            split=SPLIT,
            library=LIBRARY,
            run=RUN,
        ),
        yaml=YAMLPATH
    output:
        os.path.join(config['project_dir'], "{library}", "abundant_barcodes.pickle"),
        os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance_by_barcode.png"),
        os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance.png"),
        os.path.join(config['project_dir'], "{library}", "{library}.filtering_stats.csv")
    # The command is given the global LIBRARY list, not the {library} wildcard
    # of the job that triggered it.
    shell:
        """
        python indrops.py {input.yaml} identify_abundant_barcodes --libraries {LIBRARY}
        """

# Sort the filtered reads with `indrops.py sort`, running the worker shares one
# after another inside a single job.
#
# Worker indices run from 0 to N-1 for N --total-workers. The loop stops before
# N because bash's inclusive {0..N} range would also launch the out-of-range
# index N.
rule sort_reads:
    input:
        expand(os.path.join(config['project_dir'], "{library}", "{library}.filtering_stats.csv"), library=LIBRARY),
        yaml=YAMLPATH
    output:
        os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz"),
        os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz.index.pickle")
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    params:
        # N: total number of workers the sorting workload is split into.
        workers=config['cores']['default']
    # TODO: no log file is declared yet. Path sketched by the original author:
    # "logs/{library}_{split}_sort.log"
    shell:
        """
        for ((i = 0; i < {params.workers}; i++)); do
            python indrops.py {input.yaml} sort --libraries {LIBRARY} --total-workers {params.workers} --worker-index $i
        done;
        """

# Quantify barcodes with `indrops.py quantify`, running the worker shares one
# after another inside a single job. The declared outputs are the four
# worker<i>_<N>.* files of every library for i in 0 .. N-1, with
# N = cores.quantify_barcodes.
#
# The loop visits exactly those indices. Bash's inclusive {0..N} range would
# also launch index N, for which no output is declared.
rule quantify_barcodes:
    input:
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz.index.pickle"), split=SPLIT, run=RUN, library=LIBRARY),
        yaml=YAMLPATH,
    output:
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".metrics.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.partners"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True)
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    params:
        # N: total number of quantification workers. The loop bound `i < N`
        # replaces the previously unused `max_idx` parameter.
        cores=config['cores']['quantify_barcodes']
    shell:
        """
        for ((i = 0; i < {params.cores}; i++)); do
            python indrops.py {input.yaml} quantify --libraries {LIBRARY} --total-workers {params.cores} --worker-index $i
        done;
        """

# Merge the per-worker quantification results with `indrops.py aggregate`.
# Declares the BAM, its index, the counts table and the quantification metrics
# of every library in LIBRARY as outputs.
#
# NOTE(review): the inputs are sized by cores.quantify_barcodes, but the command
# passes cores.default as --total-workers. Confirm whether `aggregate` expects
# the quantification worker count here.
rule aggregate_umis:
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    input:
        # Count tables of all quantification workers, for every library.
        expand(
            os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".counts.tsv"),
            i=range(config['cores']['quantify_barcodes']),
            library=LIBRARY,
            allow_missing=True,
        ),
        yaml=YAMLPATH
    output:
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam.bai"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.counts.tsv.gz"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.quant_metrics.tsv.gz"), library=LIBRARY)
    params:
        workers=config['cores']['default']
    shell:
        """
        python indrops.py {input.yaml} aggregate --total-workers {params.workers} --libraries {LIBRARY}
        """