From 7629ceeebfaf7f6a98c615b1bb13823b6bb78609 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 14 Nov 2025 12:04:49 -0800 Subject: [PATCH 1/2] ingest: Add rules to extract "RESTRICTED" data Extract "OPEN" and "RESTRICTED" data into separate files that are uploaded to S3 separately. This will reduce the amount of duplicate data that we host on S3. Both rules select records with `--exclude-where "dataUseTerms!=<TERM>"` because `--include-where` in augur filter only force-includes matching records and does not drop the remaining ones. Outside of the changes in the workflow, we should delete the previously uploaded "*_with_restricted" files from S3 so that they are not confused with the new "*_restricted" files added here. --- ingest/Snakefile | 20 ++++++++++++++++++- .../nextstrain-automation/config.yaml | 5 +++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/ingest/Snakefile b/ingest/Snakefile index ac5e56fb..38731495 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -55,7 +55,25 @@ rule extract_open_data: augur filter --metadata {input.metadata} \ --sequences {input.sequences} \ --metadata-id-columns PPX_accession \ - --exclude-where "dataUseTerms=RESTRICTED" \ + --exclude-where "dataUseTerms!=OPEN" \ + --output-metadata {output.metadata} \ + --output-sequences {output.sequences} + """ + + +rule extract_restricted_data: + input: + metadata="results/metadata.tsv", + sequences="results/sequences.fasta", + output: + metadata="results/metadata_restricted.tsv", + sequences="results/sequences_restricted.fasta", + shell: + """ + augur filter --metadata {input.metadata} \ + --sequences {input.sequences} \ + --metadata-id-columns PPX_accession \ + --exclude-where "dataUseTerms!=RESTRICTED" \ --output-metadata {output.metadata} \ --output-sequences {output.sequences} """ diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index b9f9b9a2..41aa6971 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -16,14 +16,15 @@ upload: # to avoid duplicate files with different compressions 
files_to_upload: ppx_with_restricted.ndjson.zst: results/ppx.ndjson.zst - metadata_with_restricted.tsv.zst: results/metadata.tsv - sequences_with_restricted.fasta.zst: results/sequences.fasta + nextclade_with_restricted.tsv.zst: results/nextclade.tsv alignment_with_restricted.fasta.zst: results/alignment.fasta translations_with_restricted.zip: results/translations.zip metadata.tsv.zst: results/metadata_open.tsv sequences.fasta.zst: results/sequences_open.fasta + metadata_restricted.tsv.zst: results/metadata_restricted.tsv + sequences_restricted.fasta.zst: results/sequences_restricted.fasta cloudfront_domain: 'data.nextstrain.org' From 7cc5cea2b320e290197224afe8592c27ee84eaf2 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 14 Nov 2025 12:08:09 -0800 Subject: [PATCH 2/2] phylo: Update to start from multiple inputs Since the previous commit separates the OPEN and RESTRICTED files on S3, update the phylo config to start from these multiple inputs. --- phylogenetic/defaults/clade-i/config.yaml | 9 ++++++--- phylogenetic/defaults/hmpxv1/config.yaml | 9 ++++++--- phylogenetic/defaults/hmpxv1_big/config.yaml | 9 ++++++--- phylogenetic/defaults/mpxv/config.yaml | 9 ++++++--- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/phylogenetic/defaults/clade-i/config.yaml b/phylogenetic/defaults/clade-i/config.yaml index 3ecc34a8..79c97005 100644 --- a/phylogenetic/defaults/clade-i/config.yaml +++ b/phylogenetic/defaults/clade-i/config.yaml @@ -1,7 +1,10 @@ inputs: - - name: pathoplexus - metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata_with_restricted.tsv.zst" - sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_with_restricted.fasta.zst" + - name: ppx_open + metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst" + - name: ppx_restricted + metadata: 
"https://data.nextstrain.org/files/workflows/mpox/metadata_restricted.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_restricted.fasta.zst" reference: "defaults/clade-i/reference.fasta" genome_annotation: "defaults/clade-i/genome_annotation.gff3" diff --git a/phylogenetic/defaults/hmpxv1/config.yaml b/phylogenetic/defaults/hmpxv1/config.yaml index d4b556d4..9aab378f 100644 --- a/phylogenetic/defaults/hmpxv1/config.yaml +++ b/phylogenetic/defaults/hmpxv1/config.yaml @@ -1,7 +1,10 @@ inputs: - - name: pathoplexus - metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata_with_restricted.tsv.zst" - sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_with_restricted.fasta.zst" + - name: ppx_open + metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst" + - name: ppx_restricted + metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata_restricted.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_restricted.fasta.zst" reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" diff --git a/phylogenetic/defaults/hmpxv1_big/config.yaml b/phylogenetic/defaults/hmpxv1_big/config.yaml index 8db5676b..20a90086 100644 --- a/phylogenetic/defaults/hmpxv1_big/config.yaml +++ b/phylogenetic/defaults/hmpxv1_big/config.yaml @@ -1,7 +1,10 @@ inputs: - - name: pathoplexus - metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata_with_restricted.tsv.zst" - sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_with_restricted.fasta.zst" + - name: ppx_open + metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst" + - name: ppx_restricted + metadata: 
"https://data.nextstrain.org/files/workflows/mpox/metadata_restricted.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_restricted.fasta.zst" reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" diff --git a/phylogenetic/defaults/mpxv/config.yaml b/phylogenetic/defaults/mpxv/config.yaml index b0508689..d8604761 100644 --- a/phylogenetic/defaults/mpxv/config.yaml +++ b/phylogenetic/defaults/mpxv/config.yaml @@ -1,7 +1,10 @@ inputs: - - name: pathoplexus - metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata_with_restricted.tsv.zst" - sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_with_restricted.fasta.zst" + - name: ppx_open + metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst" + - name: ppx_restricted + metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata_restricted.tsv.zst" + sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences_restricted.fasta.zst" auspice_config: "defaults/mpxv/auspice_config.json" include: "defaults/mpxv/include.txt"