15 changes: 10 additions & 5 deletions SparkLeBLASTSearch.sh
@@ -4,7 +4,7 @@
# module load jdk


usage() { echo "Usage: ./SparkLeBLASTSearch.sh -q /path/to/query -db /path/to/formatted/db -gop gap_open -gex gap_extend -nalign num_alignments -m master_address (launches new Spark cluster if null) -w <num_workers> -time <Time in integer minutes> -h hostname_prefix -d /path/to/logs/dir (default current dir)" 1>&2; exit 1; }
usage() { echo "Usage: ./SparkLeBLASTSearch.sh -q /path/to/query -db /path/to/formatted/db -dbs /path/to/dbs/file -gop gap_open -gex gap_extend -nalign num_alignments -m master_address (launches new Spark cluster if null) -w <num_workers> -time <Time in integer minutes> -h hostname_prefix -d /path/to/logs/dir (default current dir)" 1>&2; exit 1; }


while [[ $# -gt 0 ]]; do
@@ -21,6 +21,11 @@ while [[ $# -gt 0 ]]; do
shift # past argument
shift # past value
;;
-dbs|--meta)
DBS="$2"
shift # past argument
shift # past value
;;
-o|--output)
OUTPUT_PATH="$2"
shift # past argument
@@ -59,7 +64,7 @@ done

# Required Args check:
# --------------------
if [ -z "${QUERY}" ] || [ -z "${DATABASE}" ]; then
if [ -z "${QUERY}" ] || [ -z "${DATABASE}" || [ -z "${DBS}" ]]; then
usage
fi

@@ -153,14 +158,14 @@ fi

# Partitions IDs Prefix
partitionsIDs="_partitionsIDs"
dbLen=$(head -n 1 "${DATABASE}/database.dbs")
numSeq=$(tail -n 1 "${DATABASE}/database.dbs")
dbLen=$(head -n 1 "${DBS}/database.dbs")
numSeq=$(tail -n 1 "${DBS}/database.dbs")
outfmt=6 # Hard coded for now since only tabular is currently supported
max_target_seqs=$(grep -o -P 'max_target_seqs.{0,5}' ${SLB_WORKDIR}/blast_args.txt | grep -o [0-9]*) # Support up to 4-digits (9999) max_target_seqs_value

# Submit Spark job to perform blast search
echo "Running Blast Search"
${SPARK_HOME}/bin/spark-submit --master ${SPARK_MASTER_ADDRESS} --verbose --conf "spark.executor.instances=1" --conf "spark.driver.extraJavaOptions=-XX:MaxHeapSize=30g" --conf "spark.worker.extraJavaOptions=-XX:MaxHeapSize=30g" --conf "spark.driver.memory=29g" --conf "spark.executor.memory=29g" --class SparkLeBLASTSearch ${SLB_WORKDIR}/target/scala-2.11/simple-project_2.11-1.0.jar "${DATABASE}${partitionsIDs}" ${QUERY} ${DATABASE} "${SLB_WORKDIR}/blastSearchScript" ${dbLen} ${numSeq} ${outfmt} ${max_target_seqs} ${NCBI_BLAST_PATH} ${SLB_WORKDIR} ${OUTPUT_PATH}
${SPARK_HOME}/bin/spark-submit --master ${SPARK_MASTER_ADDRESS} --verbose --conf "spark.executor.instances=1" --conf "spark.driver.extraJavaOptions=-XX:MaxHeapSize=30g" --conf "spark.worker.extraJavaOptions=-XX:MaxHeapSize=30g" --conf "spark.driver.memory=29g" --conf "spark.executor.memory=29g" --class SparkLeBLASTSearch ${SLB_WORKDIR}/target/scala-2.11/simple-project_2.11-1.0.jar "${DBS}${partitionsIDs}" ${QUERY} ${DATABASE} "${SLB_WORKDIR}/blastSearchScript" ${dbLen} ${numSeq} ${outfmt} ${max_target_seqs} ${NCBI_BLAST_PATH} ${SLB_WORKDIR} ${OUTPUT_PATH}
echo "Blast Search Done"

if [ ! -z ${SPARK_SLURM_PATH} ]; then
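For reference, a minimal example invocation of the updated search script, following the new usage string. All paths and numeric values below are placeholders, not taken from this PR; the new -dbs argument points at the location holding the database.dbs file produced by the makedb step.

# Hypothetical example (placeholder paths and values):
./SparkLeBLASTSearch.sh \
    -q /path/to/query.fa \
    -db /path/to/formatted/db \
    -dbs /path/to/dbs \
    -gop 11 -gex 1 -nalign 250 \
    -w 4 -time 60 \
    -d /path/to/logs/dir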
38 changes: 20 additions & 18 deletions containers/singularity/fsub.sh
@@ -1,12 +1,9 @@
#!/bin/bash

set -x

DBFILE=$1
QUERYFILE=$2
NPROC=$3
ELAPSE=${4:-30:00}

USAGE="$0 \${NPROC}"
if [ -z ${NPROC} ]; then
echo "NPROC not set!"
@@ -28,15 +25,12 @@ if email=$(git config --get user.email); then
else
echo "$0 WARNING: git email not set!"
fi

if [ ${NPROC} -gt 384 ]; then
RSCGRP=large;
else
RSCGRP=small;
fi

OUTPUT_DIR=output

NAME=sparkle-${NPROC}
PJSUB_ARGS=(
-N ${NAME}
@@ -55,26 +49,34 @@ PJSUB_ARGS=(
--mpi proc=${NPROC}
${email_args}
)

if [[ "${CLEARALL^^}" =~ ^(YES|ON|TRUE)$ ]]; then
if [[ "${CLEARALL^^}" =~ ^(YES|ON|TRUE)$ ]]; then
# must be outside pjsub
rm -rf output run log work data/makedb_out data/search_out
rm -rf output run log work data/search_out
fi

mkdir -p ${OUTPUT_DIR}
pjsub ${PJSUB_ARGS[@]} << EOF
OF_PROC=${OUTPUT_DIR}/\${PJM_JOBID}-${NAME}/mpi

if [ ! -e ./data/makedb_out/${DBFILE}_$(( NPROC - 1 ))/database.dbs ]; then
jobid=$(pjsub "${PJSUB_ARGS[@]}" << EOF
OF_PROC=${OUTPUT_DIR}/\${PJM_JOBID}-${NAME}/mpi
echo "THIS IS WHAT YOU CARE ABUT: \${OF_PROC}"
mkdir -p log run work \$(dirname \${OF_PROC})

mpiexec -of-proc \${OF_PROC} ./gatherhosts_ips hosts-\${PJM_JOBID}
mpiexec -of-proc \${OF_PROC} ./start_spark_cluster.sh &
bash -x ./run_spark_jobs.sh ${DBFILE} ${QUERYFILE}
bash -x ./run_spark_jobs_makedb.sh ${DBFILE} ${QUERYFILE}
# mpiexec -of-proc \${OF_PROC} ./stop_spark_cluster.sh &
rm -rf master_success-\${PJM_JOBID}
echo FSUB IS DONE
EOF

# DBFILE=non-rRNA-reads.fa
# Galaxy25-\[Geobacter_metallireducens.fasta\].fasta
# QUERYFILE=sample_text.fa
)
jobid=$(echo "$jobid" | grep -oE '[0-9]+' | tail -n1)
echo $jobid
pjwait ${jobid}
fi
pjsub "${PJSUB_ARGS[@]}" << EOF
OF_PROC=${OUTPUT_DIR}/\${PJM_JOBID}-${NAME}/mpi
mkdir -p log run work \$(dirname \${OF_PROC})
mpiexec -of-proc \${OF_PROC} ./gatherhosts_ips hosts-\${PJM_JOBID}
bash -x ./run_spark_jobs.sh ${DBFILE} ${QUERYFILE} \${OF_PROC}
rm -rf master_success-\${PJM_JOBID}
echo FSUB IS DONE
EOF
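A hedged sketch of how the rewritten fsub.sh could be driven, based only on the positional arguments it reads at the top of the script ($1=DBFILE, $2=QUERYFILE, $3=NPROC, optional $4=ELAPSE) and the CLEARALL check shown above; the file names here are placeholders.

# Hypothetical example (placeholder file names): when the makedb output for this
# DBFILE/NPROC pair is missing, the script now submits a makedb job first, waits
# on it with pjwait, and only then submits the search job.
CLEARALL=no ./fsub.sh my_database.fa my_query.fa 384 30:00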