Rebuild biobakery-profiler as a three-stage image on python:3.9.17-slim-bookworm
(samtools build, Python venv build, runtime) and add the medi image.

Review notes:
- Line structure was restored from a whitespace-collapsed copy of this diff;
  leading indentation inside the payloads is a best-effort reconstruction.
- The `index` blob ids were not regenerated after the review fixes to the two
  Dockerfiles (pybuild cleanup chain, runtime locale, medi COPY destination).

diff --git a/biobakery-profiler/Dockerfile b/biobakery-profiler/Dockerfile
index 6356888..2d6c946 100644
--- a/biobakery-profiler/Dockerfile
+++ b/biobakery-profiler/Dockerfile
@@ -1,65 +1,109 @@
-FROM python:3.9.17-bookworm AS dbbuild
+# ============================================================
+# Stage 1: Build samtools
+# ============================================================
+FROM python:3.9.17-slim-bookworm AS dbbuild
+
 RUN apt-get update && \
-    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y wget ca-certificates build-essential zlib1g-dev libbz2-dev liblzma-dev && \
+    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
+        wget ca-certificates build-essential zlib1g-dev libbz2-dev liblzma-dev && \
     rm -rf /var/lib/apt/lists/*
 
-# RUN mkdir -p /dbs/util/ && \
-#     wget -c --read-timeout=5 http://huttenhower.sph.harvard.edu/humann_data/full_mapping_v201901b.tar.gz && \
-#     cd /dbs/util/ && \
-#     tar xzf /full_mapping_v201901b.tar.gz && \
-#     rm /full_mapping_v201901b.tar.gz
-RUN wget https://github.com/samtools/samtools/releases/download/1.15.1/samtools-1.15.1.tar.bz2 && \
-    tar xjf samtools-1.15.1.tar.bz2 && cd samtools-1.15.1 && \
-    ./configure --without-curses && make && make install && which samtools #
-
-
-
-#FROM ubuntu:18.04 AS runtime-image
-FROM python:3.9.17-bookworm AS runtime-image
+RUN wget -q https://github.com/samtools/samtools/releases/download/1.15.1/samtools-1.15.1.tar.bz2 && \
+    tar xjf samtools-1.15.1.tar.bz2 && \
+    cd samtools-1.15.1 && \
+    ./configure --without-curses && \
+    make -j$(nproc) && \
+    make install && \
+    which samtools
 
+# ============================================================
+# Stage 2: Build Python packages in a venv we can copy
+# ============================================================
+FROM python:3.9.17-slim-bookworm AS pybuild
 
 ARG HUMANNVERSION=0.0.0
 ARG METAPHLANVERSION=0.0.0
 
-COPY --from=dbbuild /usr/local/bin/samtools /usr/local/bin/samtools
-
-WORKDIR /tmp
-RUN export LC_ALL=en_US.UTF-8 && export LANG=en_US.UTF-8
-
 RUN apt-get update && \
-    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y build-essential libjpeg-dev zlib1g-dev libbz2-dev liblzma-dev ncbi-blast+ mafft raxml && \
+    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
+        build-essential \
+        wget \
+        ca-certificates \
+        patch \
+        libjpeg-dev \
+        zlib1g-dev \
+        libbz2-dev \
+        liblzma-dev && \
     rm -rf /var/lib/apt/lists/*
 
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+COPY nucleotide_${HUMANNVERSION}.patch /tmp/nucleotide.patch
+COPY config_${HUMANNVERSION}.patch /tmp/config.patch
+
+# numpy and cython are installed first because biom-format will not build
+# without numpy and does not pull it in itself. The sed calls bump the bowtie2
+# release named in humann's setup.py from 2.2.3 to 2.5.1.
+# Every step is chained with && so a failed install or patch fails the build
+# instead of being masked by the cleanup; the directory cleanups use -prune so
+# find does not try to descend into a directory it has just deleted.
+RUN pip install --no-cache-dir numpy cython && \
+    pip install --no-cache-dir boto3 cloudpickle awscli && \
+    pip install --no-cache-dir biom-format && \
+    wget -qO- https://github.com/biobakery/humann/archive/refs/tags/${HUMANNVERSION}.tar.gz \
+        | tar xz -C /tmp && \
+    mv /tmp/humann-* /tmp/humann && \
+    cd /tmp/humann && \
+    patch humann/search/nucleotide.py < /tmp/nucleotide.patch && \
+    patch humann/config.py < /tmp/config.patch && \
+    mv setup.py tmp.py && \
+    sed 's|2\.2\.3|2\.5\.1|g' tmp.py > tmp2.py && \
+    sed 's|bowtie2_folder="bowtie2-2\.5\.1|bowtie2_folder="bowtie2-2\.5\.1-linux-x86_64|g' tmp2.py > setup.py && \
+    pip install --no-cache-dir . --no-binary :all: && \
+    pip install --no-cache-dir metaphlan==${METAPHLANVERSION} && \
+    find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
+    find /opt/venv -type d -name "tests" -prune -exec rm -rf {} + && \
+    find /opt/venv -type d -name "test" -prune -exec rm -rf {} + && \
+    find /opt/venv -name "*.pyc" -delete && \
+    find /opt/venv -name "*.pyo" -delete && \
+    rm -rf /tmp/humann /tmp/*.patch
+
+# ============================================================
+# Stage 3: Minimal runtime image
+# ============================================================
+FROM python:3.9.17-slim-bookworm AS runtime-image
 
-# You may be tempted to do these all in one call but unfortunately
-# (eg) biom-format wont build without numpy, and it wont install it automatically
-
-# see the glpsol issue here https://forum.biobakery.org/t/silent-errors-with-bad-install-of-libglpk/4814
-
-# this nucleotide.py makes bowtie indexing parallel
-ADD nucleotide_${HUMANNVERSION}.patch /tmp/nucleotide.patch
-ADD config_${HUMANNVERSION}.patch /tmp/config.patch
+ARG HUMANNVERSION=0.0.0
+ARG METAPHLANVERSION=0.0.0
 
-# We also have to bump to a more recent version of bowtie that is multithreaded,
-# so we clumsily make that with some uninspired sed calls
-RUN pip3 install boto3 cloudpickle awscli && \
-    git clone --depth=1 --branch ${HUMANNVERSION} https://github.com/biobakery/humann.git humann && \
-    du -h --max-depth=0 humann && \
-    cd humann && \
-    patch humann/search/nucleotide.py < /tmp/nucleotide.patch && \
-    patch humann/config.py < /tmp/config.patch && \
-    mv setup.py tmp.py && cat tmp.py | sed "s|2\.2\.3|2\.5\.1|g" > tmp2.py && \
-    sed "s|bowtie2_folder\=\"bowtie2-2\.5\.1|bowtie2_folder\=\"bowtie2-2\.5\.1-linux-x86_64|g" tmp2.py > setup.py && \
-    pip3 install -vvv . --no-binary :all: && \
-    ldconfig /usr/local/lib && glpsol --version && \
-    pip3 install numpy cython && pip3 install --no-cache-dir biom-format && \
-    pip3 install metaphlan==${METAPHLANVERSION} && du --max-depth 1 -h ~/.cache/pip && \
-    rm -r ~/.cache/pip
+# Copy samtools binary and its needed libs
+COPY --from=dbbuild /usr/local/bin/samtools /usr/local/bin/samtools
 
-# The --help is so you can check to ensure that the grouping table are properly read
-#RUN humann_config --update database_folders utility_mapping /dbs/util/ && \
-#    humann_regroup_table --help
+# Copy the entire pre-built venv
+COPY --from=pybuild /opt/venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
 
-#RUN humann_test --run-functional-tests-end-to-end
+# Install ONLY runtime dependencies — no build-essential, no git
+RUN apt-get update && \
+    DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -y \
+        libjpeg62-turbo \
+        zlib1g \
+        libbz2-1.0 \
+        liblzma5 \
+        ncbi-blast+ \
+        mafft \
+        raxml \
+        libgomp1 \
+        libglpk40 \
+        glpk-utils && \
+    rm -rf /var/lib/apt/lists/* && \
+    ldconfig /usr/local/lib && \
+    glpsol --version
+
+# No locales package is installed in this stage, so en_US.UTF-8 is never
+# generated; C.UTF-8 is available without it.
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
 
-#RUN cd humann-3.9 && ls examples && humann --input examples/demo.fasta.gz --output tmpout
+WORKDIR /tmp
diff --git a/biobakery-profiler/config_4.0.0.alpha.1-final-smaller.patch b/biobakery-profiler/config_4.0.0.alpha.1-final-smaller.patch
new file mode 100644
index 0000000..c5246b8
--- /dev/null
+++ b/biobakery-profiler/config_4.0.0.alpha.1-final-smaller.patch
@@ -0,0 +1,14 @@
+105c105,107
+< user_edit_config_file="humann.cfg"
+---
+> # User config file
+> if not os.environ.get("HUMANN_CONFIG"):
+>     user_edit_config_file="humann.cfg"
+107,108c109,112
+< full_path_user_edit_config_file=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+<     user_edit_config_file)
+---
+>     full_path_user_edit_config_file=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+>         user_edit_config_file)
+> else:
+>     full_path_user_edit_config_file=os.environ.get("HUMANN_CONFIG")
diff --git a/biobakery-profiler/nucleotide_4.0.0.alpha.1-final.patch b/biobakery-profiler/nucleotide_4.0.0.alpha.1-final-smaller.patch
similarity index 100%
rename from biobakery-profiler/nucleotide_4.0.0.alpha.1-final.patch
rename to biobakery-profiler/nucleotide_4.0.0.alpha.1-final-smaller.patch
diff --git a/biobakery_build_manifest.csv b/biobakery_build_manifest.csv
index 527a0f5..bab1bb7 100644
--- a/biobakery_build_manifest.csv
+++ b/biobakery_build_manifest.csv
@@ -1,3 +1,3 @@
-biobakery-profiler,4.0.6--4.0.0.alpha.1-final
+biobakery-profiler,4.0.6--4.0.0.alpha.1-final-smaller
 biobakery-profiler,4.1.0--v3.9
 biobakery-profiler,4.0.5--3.6.1
diff --git a/build_manifest.csv b/build_manifest.csv
index db43a15..8274370 100755
--- a/build_manifest.csv
+++ b/build_manifest.csv
@@ -11,6 +11,7 @@ rgi,6.0.0
 bowtie2,2.5.0
 snap-aligner,2.0.1
 utility,0b
+medi,2.1.0
 phanta,1.1.0-dev
 portalclient,1.4.5
 prokka_and_blast,0.1.1
diff --git a/medi/Dockerfile b/medi/Dockerfile
new file mode 100644
index 0000000..9a4843e
--- /dev/null
+++ b/medi/Dockerfile
@@ -0,0 +1,19 @@
+FROM --platform=linux/amd64 docker.io/condaforge/miniforge3:latest
+
+RUN mkdir /tmp/medi /tmp/medi/bin
+
+# COPY with several sources needs a directory destination ending in '/'.
+COPY medi.yml Makefile patches/*.patch /tmp/medi/
+
+RUN mamba env create -n medi -f /tmp/medi/medi.yml && \
+    . ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate medi && \
+    cd /tmp/medi && make report && mv /tmp/medi/bin/kraken2-report /bin && \
+    patch ${CONDA_DIR}/envs/medi/share/kraken2-2.1.3-4/libexec/build_kraken2_db.sh /tmp/medi/build.patch && \
+    patch ${CONDA_DIR}/envs/medi/share/kraken2-2.1.3-4/libexec/download_genomic_library.sh /tmp/medi/download_genomic.patch && \
+    conda clean --tarballs --index-cache --packages --yes && \
+    find ${CONDA_DIR} -follow -type f -name '*.a' -delete && \
+    find ${CONDA_DIR} -follow -type f -name '*.pyc' -delete && \
+    conda clean --force-pkgs-dirs --all --yes && \
+    rm -rf /tmp/medi
+
+ENV PATH="/opt/conda/envs/medi/bin:$PATH"
\ No newline at end of file
diff --git a/medi/Makefile b/medi/Makefile
new file mode 100644
index 0000000..da2e861
--- /dev/null
+++ b/medi/Makefile
@@ -0,0 +1,15 @@
+CXX := g++
+K2DIR := "src/kraken2"
+SRC := ${K2DIR}/src
+
+repo:
+	rm -rf ${K2DIR}
+	git clone https://github.com/daydream-boost/kraken2 ${K2DIR}
+
+report: repo
+	${CXX} -O3 -std=c++11 \
+		${SRC}/mmap_file.cc ${SRC}/reports.cc ${SRC}/taxonomy.cc \
+		${SRC}/kraken2-report.cpp -o ./bin/kraken2-report
+
+clean:
+	rm -rf ${K2DIR}
\ No newline at end of file
diff --git a/medi/medi.yml b/medi/medi.yml
new file mode 100644
index 0000000..d6156f6
--- /dev/null
+++ b/medi/medi.yml
@@ -0,0 +1,28 @@
+name: medi
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - patch
+  - python>=3.8
+  - nextflow>=23.10.0
+  - fastp
+  - kraken2==2.1.3
+  - bracken==2.6.0
+  - prodigal
+  - multiqc>=1.28.0
+  - typeguard<=4.4.0
+  - biopython
+  - sourmash
+  - pandas
+  - r-base
+  - r-data.table
+  - r-reutils
+  - r-r.utils
+  - r-rcurl
+  - r-magrittr
+  - r-futile.logger
+  - bioconductor-biostrings
+  - taxonkit>=0.20.0
+  - architeuthis>=0.5.0
+  - gxx
\ No newline at end of file
diff --git a/medi/patches/build.patch b/medi/patches/build.patch
new file mode 100644
index 0000000..861dc0b
--- /dev/null
+++ b/medi/patches/build.patch
@@ -0,0 +1,28 @@
+--- /home/gpfs/o_diener/code/k2/scripts/build_kraken2_db.sh 2025-11-17 14:42:43.458544663 +0100
++++ /home/gpfs/o_diener/miniforge3/envs/medi/share/kraken2-2.1.3-4/libexec/build_kraken2_db.sh 2025-04-29 09:24:03.422237276 +0200
+@@ -32,7 +32,7 @@
+ }
+ 
+ function list_sequence_files() {
+-  find library/ '(' -name '*.fna' -o -name '*.faa' ')' -print0
++  find -L library/ '(' -name '*.fna' -o -name '*.faa' ')' -print0
+ }
+ 
+ start_time=$(get_current_time)
+@@ -66,14 +66,14 @@
+ 
+ echo "Creating sequence ID to taxonomy ID map (step 1)..."
+ if [ -d "library/added" ]; then
+-  find library/added/ -name 'prelim_map_*.txt' | xargs cat > library/added/prelim_map.txt
++  find -L library/added/ -name 'prelim_map_*.txt' | xargs cat > library/added/prelim_map.txt
+ fi
+ seqid2taxid_map_file=seqid2taxid.map
+ if [ -e "$seqid2taxid_map_file" ]; then
+   echo "Sequence ID to taxonomy ID map already present, skipping map creation."
+ else
+   step_time=$(get_current_time)
+-  find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt
++  find -L library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt
+   if [ ! -s "taxonomy/prelim_map.txt" ]; then
+     echo "No preliminary seqid/taxid mapping files found, aborting."
+     exit 1
\ No newline at end of file
diff --git a/medi/patches/download_genomic.patch b/medi/patches/download_genomic.patch
new file mode 100644
index 0000000..6ef5b7a
--- /dev/null
+++ b/medi/patches/download_genomic.patch
@@ -0,0 +1,11 @@
+--- /home/gpfs/o_diener/code/k2/scripts/download_genomic_library.sh 2025-11-17 14:42:43.476888231 +0100
++++ /home/gpfs/o_diener/miniforge3/envs/medi/share/kraken2-2.1.3-4/libexec/download_genomic_library.sh 2025-04-09 12:42:41.740766021 +0200
+@@ -68,7 +68,7 @@
+     else
+       awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.fna\.gz' > manifest.txt
+     fi
+-    cat manifest.txt | xargs -n1 -I{} wget -q $FTP_SERVER/genomes/refseq/plasmid/{}
++    cat manifest.txt | xargs -n1 -I{} wget -q https://$NCBI_SERVER/genomes/refseq/plasmid/{}
+     cat manifest.txt | xargs -n1 -I{} gunzip -c {} > $library_file
+     rm -f plasmid.* .listing
+     scan_fasta_file.pl $library_file > prelim_map.txt
\ No newline at end of file