From 638f8a2ea2dc813e5748bb16b6162dec8aca2e24 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:22:04 +0000 Subject: [PATCH 1/5] Initial plan From 923d58a24b5ed920821df7e2e955eba1641ea7ce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:25:44 +0000 Subject: [PATCH 2/5] Add Dockerfile and update README with Docker instructions Co-authored-by: neliebi <51783034+neliebi@users.noreply.github.com> --- Dockerfile | 38 ++++++++++++++++ README.md | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..52410be --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +# Use Python 3.8 as base image +FROM python:3.8-slim + +# Set environment variables +ENV PYTHONUNBUFFERED=1 \ + DEBIAN_FRONTEND=noninteractive + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + libxml2-dev \ + libxslt-dev \ + libz-dev \ + ncbi-blast+ \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Create directories for data mounting +RUN mkdir -p /data /config + +# Set the config file location +ENV CONFIG_PATH=/config/config.ini + +# Default command shows help +CMD ["python", "AddedAnnotations.py", "--help"] diff --git a/README.md b/README.md index aa2ccc8..7ea0081 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,10 @@ This repository provides tools and scripts for extracting and adding annotations ### Table of Contents * Installation +* Docker Installation * Configuration * Usage +* Docker Usage * Contributing * License @@ -15,6 +17,18 @@ This repository provides tools and scripts for extracting and adding annotations To install the necessary dependencies, run: pip install -r requirements.txt +### Docker Installation + +You can also run the scripts using Docker, which provides a containerized environment with all dependencies pre-installed. + +#### Building the Docker Image + +```bash +docker build -t added-annotations . +``` + +This will create a Docker image with Python 3.8, BLAST+, and all required Python dependencies. + ### Configuration The repository uses a config.ini file for configuration, which is not included in the repository. This file should be created in the root directory of the project with the following structure: @@ -43,6 +57,35 @@ rfam_ftp: /rfam_files_combined.txt pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST ``` +#### Docker Configuration + +When using Docker, the config.ini file should use container paths. Create a config file at `/path/on/host/config.ini` with the following structure: + +``` +[file_paths] +CP_ftp: /data/cpx/ +components_cif: /data/components.cif +pmc_ftp_gz: /data/pmc/PMID_PMCID_DOI.csv.gz +pmc_ftp: /data/pmc/PMID_PMCID_DOI.csv +assembly_ftp: /data/pdbe/assembly/ +BLAST_DB: /data/uniprotkb_swissprot +BLASTP_BIN: blastp +sifts_GO: /data/pdbe/go/pdb_chain_go.csv +GO_obo: /data/go.obo +emdb_empiar_list: /data/emdb_empiar.json +sifts: /data/sifts/ +alphafold_ftp: /data/accession_ids.txt +uniprot_tab: /data/uniprot.tsv + +[api] +pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST + +[params] +minimal_map_fragment_length: 15 +``` + +**Note:** The paths in the Docker config should match the container mount points (e.g., `/data/...`), not the host paths. + #### File Sources and Download Links | File | Descritption | Download Link | |-------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| @@ -91,6 +134,94 @@ fetch_afdb.py: python fetch_afdb.py -w ``` +### Docker Usage + +When running the scripts in Docker, you need to mount your data directories and config file as read-only volumes. The general pattern is: + +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/data:/data:ro \ + -v /path/on/host/output:/output \ + added-annotations python +``` + +#### Docker Volume Mounting + +- `-v /path/on/host/config.ini:/config/config.ini:ro` - Mount your config file as read-only +- `-v /path/on/host/data:/data:ro` - Mount your data directory containing all required files (cpx, components.cif, etc.) as read-only +- `-v /path/on/host/output:/output` - Mount output directory for writing results (read-write) + +**Important:** +- Use `:ro` flag for read-only mounts on data and config to prevent accidental modifications +- Ensure your config.ini uses container paths (e.g., `/data/...`) that match your volume mounts +- Map all directories referenced in your config.ini file to appropriate container paths + +#### Running Scripts in Docker + +Execute the scripts independently in the following recommended order: + +##### EMPIAR mapping +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/empiar_metadata:/empiar_metadata:ro \ + -v /path/on/host/output:/output \ + added-annotations python fetch_empiar.py -w /output -f /empiar_metadata +``` + +##### Publication mapping +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/emdb_metadata:/emdb_metadata:ro \ + -v /path/on/host/output:/output \ + added-annotations python fetch_pubmed.py -w /output -f /emdb_metadata +``` + +##### Protein, complexes and ligands mapping +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/data:/data:ro \ + -v /path/on/host/emdb_metadata:/emdb_metadata:ro \ + -v /path/on/host/output:/output \ + added-annotations python AddedAnnotations.py -w /output -f /emdb_metadata --all -t 4 +``` + +##### AlphaFold DB mapping +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/data:/data:ro \ + -v /path/on/host/output:/output \ + added-annotations python fetch_afdb.py -w /output +``` + +##### Generate Europe PMC Links +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/output:/output \ + added-annotations python generate_eupmc_links.py +``` + +##### Compare Release +```bash +docker run --rm \ + -v /path/on/host/config.ini:/config/config.ini:ro \ + -v /path/on/host/latest:/latest:ro \ + -v /path/on/host/previous:/previous:ro \ + added-annotations python compare_release.py /latest /previous +``` + +##### Write XML files +```bash +docker run --rm \ + -v /path/on/host/output:/output \ + added-annotations python write_xml.py /output +``` + ### Further information For more information about EMICSS, visit the official EMICSS website (https://www.ebi.ac.uk/emdb/emicss). This page provides detailed information about the EMDB/EMICSS project. From 71644bc9280c29d94a0b090595cda68f301fb40c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:26:38 +0000 Subject: [PATCH 3/5] Add .dockerignore and example Docker config file Co-authored-by: neliebi <51783034+neliebi@users.noreply.github.com> --- .dockerignore | 44 +++++++++++++++++++++++++++++++++++++++ README.md | 2 +- config.ini.docker-example | 26 +++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 .dockerignore create mode 100644 config.ini.docker-example diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..8db0002 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,44 @@ +# Git +.git +.github +.gitignore + +# Config files (users should mount their own) +config.ini +*.ini +!config.ini.docker-example + +# Logs +*.log + +# Data files +*.tsv +*.csv +*.gz + +# Test files +.coverage +unit_test/ + +# Python cache +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Temporary files +*.tmp +tmp/ diff --git a/README.md b/README.md index 7ea0081..36d62c3 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST #### Docker Configuration -When using Docker, the config.ini file should use container paths. Create a config file at `/path/on/host/config.ini` with the following structure: +When using Docker, the config.ini file should use container paths. An example configuration file is provided in `config.ini.docker-example`. Create your own config file on the host machine with the following structure: ``` [file_paths] diff --git a/config.ini.docker-example b/config.ini.docker-example new file mode 100644 index 0000000..a4eb6a6 --- /dev/null +++ b/config.ini.docker-example @@ -0,0 +1,26 @@ +# Example config.ini file for Docker usage +# Copy this file to your host machine and update the paths to match your setup +# All paths should point to locations inside the container (e.g., /data/...) +# Mount your actual data directories when running the container + +[file_paths] +# Point to the container mount point, not the host path +CP_ftp = /data/cpx/ +components_cif = /data/components.cif +pmc_ftp_gz = /data/pmc/PMID_PMCID_DOI.csv.gz +pmc_ftp = /data/pmc/PMID_PMCID_DOI.csv +assembly_ftp = /data/pdbe/assembly/ +BLAST_DB = /data/uniprotkb_swissprot +BLASTP_BIN = blastp +sifts_GO = /data/pdbe/go/pdb_chain_go.csv +GO_obo = /data/go.obo +emdb_empiar_list = /data/emdb_empiar.json +sifts = /data/sifts/ +alphafold_ftp = /data/accession_ids.txt +uniprot_tab = /data/uniprot.tsv + +[api] +pmc = https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST + +[params] +minimal_map_fragment_length = 15 From 98335ae8ab3799ab6e836e6ee27050622f560e10 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:27:29 +0000 Subject: [PATCH 4/5] Fix config file syntax to use equals signs instead of colons Co-authored-by: neliebi <51783034+neliebi@users.noreply.github.com> --- README.md | 66 +++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 36d62c3..c9f9478 100644 --- a/README.md +++ b/README.md @@ -35,26 +35,26 @@ The repository uses a config.ini file for configuration, which is not included i ``` [file_paths] -uniprot_tab: /uniprot.tsv -CP_ftp: /complextab -components_cif: /components.cif -chem_comp_list: /chem_comp_list.xml -pmc_ftp_gz: /PMID_PMCID_DOI.csv.gz -pmc_ftp: /PMID_PMCID_DOI.csv -emdb_pubmed: /emdb_pubmed.log -emdb_orcid: /emdb_orcid.log -assembly_ftp: /assembly/ -BLAST_DB: /ncbi-blast-2.13.0+/database/uniprot_sprot -BLASTP_BIN: blastp -sifts_GO: /pdb_chain_go.csv -GO_obo: /go.obo -GO_interpro: /nfs/ftp/pub/databases/GO/goa/external2go/interpro2go -sifts: /split_xml/ -alphafold_ftp: /accession_ids.txt -rfam_ftp: /rfam_files_combined.txt +uniprot_tab = /uniprot.tsv +CP_ftp = /complextab +components_cif = /components.cif +chem_comp_list = /chem_comp_list.xml +pmc_ftp_gz = /PMID_PMCID_DOI.csv.gz +pmc_ftp = /PMID_PMCID_DOI.csv +emdb_pubmed = /emdb_pubmed.log +emdb_orcid = /emdb_orcid.log +assembly_ftp = /assembly/ +BLAST_DB = /ncbi-blast-2.13.0+/database/uniprot_sprot +BLASTP_BIN = blastp +sifts_GO = /pdb_chain_go.csv +GO_obo = /go.obo +GO_interpro = /nfs/ftp/pub/databases/GO/goa/external2go/interpro2go +sifts = /split_xml/ +alphafold_ftp = /accession_ids.txt +rfam_ftp = /rfam_files_combined.txt [api] -pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST +pmc = https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST ``` #### Docker Configuration @@ -63,25 +63,25 @@ When using Docker, the config.ini file should use container paths. An example co ``` [file_paths] -CP_ftp: /data/cpx/ -components_cif: /data/components.cif -pmc_ftp_gz: /data/pmc/PMID_PMCID_DOI.csv.gz -pmc_ftp: /data/pmc/PMID_PMCID_DOI.csv -assembly_ftp: /data/pdbe/assembly/ -BLAST_DB: /data/uniprotkb_swissprot -BLASTP_BIN: blastp -sifts_GO: /data/pdbe/go/pdb_chain_go.csv -GO_obo: /data/go.obo -emdb_empiar_list: /data/emdb_empiar.json -sifts: /data/sifts/ -alphafold_ftp: /data/accession_ids.txt -uniprot_tab: /data/uniprot.tsv +CP_ftp = /data/cpx/ +components_cif = /data/components.cif +pmc_ftp_gz = /data/pmc/PMID_PMCID_DOI.csv.gz +pmc_ftp = /data/pmc/PMID_PMCID_DOI.csv +assembly_ftp = /data/pdbe/assembly/ +BLAST_DB = /data/uniprotkb_swissprot +BLASTP_BIN = blastp +sifts_GO = /data/pdbe/go/pdb_chain_go.csv +GO_obo = /data/go.obo +emdb_empiar_list = /data/emdb_empiar.json +sifts = /data/sifts/ +alphafold_ftp = /data/accession_ids.txt +uniprot_tab = /data/uniprot.tsv [api] -pmc: https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST +pmc = https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST [params] -minimal_map_fragment_length: 15 +minimal_map_fragment_length = 15 ``` **Note:** The paths in the Docker config should match the container mount points (e.g., `/data/...`), not the host paths. From 5dd6d6d0f0fbe0bf912c30fe7e4d0c0cbc75ad5c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:28:12 +0000 Subject: [PATCH 5/5] Fix typo in README and add params section to config example Co-authored-by: neliebi <51783034+neliebi@users.noreply.github.com> --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9f9478..cf5c3ab 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,9 @@ rfam_ftp = /rfam_files_combined.txt [api] pmc = https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST + +[params] +minimal_map_fragment_length = 15 ``` #### Docker Configuration @@ -87,7 +90,7 @@ minimal_map_fragment_length = 15 **Note:** The paths in the Docker config should match the container mount points (e.g., `/data/...`), not the host paths. #### File Sources and Download Links -| File | Descritption | Download Link | +| File | Description | Download Link | |-------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| | uniprot.tsv | UniProt annpotations | https://rest.uniprot.org/uniprotkb/stream?fields=accession,xref_pdb,protein_name&query=((database:pdb))&format=tsv&compressed=false | | complextab | Complex Portal data | https://ftp.ebi.ac.uk/pub/databases/complexportal/complexes.tab.gz |