Skip to content
145 changes: 77 additions & 68 deletions CPU/juicer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -628,62 +628,62 @@ fi

#Skip if post-processing only is required
if [ -z $postproc ]
then
# if we haven't already zipped up merged_dedup
if [ ! -s ${outputdir}/merged_dedup.bam ]
then
# Check that dedupping worked properly
# in ideal world, we would check this in split_rmdups and not remove before we know they are correct
size1=$(samtools view $sthreadstring -h ${outputdir}/merged_sort.bam | wc -l | awk '{print $1}')
size2=$(wc -l ${outputdir}/merged_dedup.sam | awk '{print $1}')

if [ $size1 -ne $size2 ]
then
echo "***! Error! The sorted file and dups/no dups files do not add up, or were empty."
exit 1
fi
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
else
if [ ! -s ${outputdir}/merged1.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
fi
if [ ! -s ${outputdir}/merged30.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
fi
then
# if we haven't already zipped up merged_dedup
if [ ! -s ${outputdir}/merged_dedup.bam ]
then
# Check that dedupping worked properly
# in ideal world, we would check this in split_rmdups and not remove before we know they are correct
size1=$(samtools view $sthreadstring -h ${outputdir}/merged_sort.bam | wc -l | awk '{print $1}')
size2=$(wc -l ${outputdir}/merged_dedup.sam | awk '{print $1}')
if [ $size1 -ne $size2 ]
then
echo "***! Error! The sorted file and dups/no dups files do not add up, or were empty."
exit 1
fi
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
else
if [ ! -s ${outputdir}/merged1.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
fi
if [ ! -s ${outputdir}/merged30.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
fi
fi

if [[ $threadsHic -gt 1 ]]
then
if [ ! -s ${outputdir}/merged1_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged1.txt 500000 > ${outputdir}/merged1_index.txt
fi
if [ ! -s ${outputdir}/merged30_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged30.txt 500000 > ${outputdir}/merged30_index.txt
fi
if [ ! -s ${outputdir}/merged1_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged1.txt 500000 > ${outputdir}/merged1_index.txt
fi
if [ ! -s ${outputdir}/merged30_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged30.txt 500000 > ${outputdir}/merged30_index.txt
fi
fi

if [ ! -s ${outputdir}/merged_dedup.bam ]
then
if samtools view -b $sthreadstring ${outputdir}/merged_dedup.sam > ${outputdir}/merged_dedup.bam
then
rm ${outputdir}/merged_dedup.sam
rm ${outputdir}/merged_sort.bam
fi
if samtools view -b $sthreadstring ${outputdir}/merged_dedup.sam > ${outputdir}/merged_dedup.bam
then
rm ${outputdir}/merged_dedup.sam
rm ${outputdir}/merged_sort.bam
fi
fi

if [ "$methylation" = 1 ]
then
$load_methyl
samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam
${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt
rm ${outputdir}/merged_dedup_sort.bam ${outputdir}/merged_dedup_sort.cytosine_report.txt*
$load_methyl
samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam
${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt
rm ${outputdir}/merged_dedup_sort.bam ${outputdir}/merged_dedup_sort.cytosine_report.txt*
fi


Expand All @@ -692,24 +692,24 @@ if [ -z $postproc ]
tail -n1 $headfile | awk '{printf"%-1000s\n", $0}' > $outputdir/inter.txt
if [ $singleend -eq 1 ]
then
ret=$(samtools view $sthreadstring -f 1024 -F 256 $outputdir/merged_dedup.bam | awk '{if ($0~/rt:A:7/){singdup++}else{dup++}}END{print dup,singdup}')
dups=$(echo $ret | awk '{print $1}')
singdups=$(echo $ret | awk '{print $2}')
cat $splitdir/*.res.txt | awk -v dups=$dups -v singdups=$singdups -v ligation=$ligation -v singleend=1 -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
else
dups=$(samtools view -c -f 1089 -F 256 $sthreadstring $outputdir/merged_dedup.bam)
cat $splitdir/*.res.txt | awk -v dups=$dups -v ligation=$ligation -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
ret=$(samtools view $sthreadstring -f 1024 -F 256 $outputdir/merged_dedup.bam | awk '{if ($0~/rt:A:7/){singdup++}else{dup++}}END{print dup,singdup}')
dups=$(echo $ret | awk '{print $1}')
singdups=$(echo $ret | awk '{print $2}')
cat $splitdir/*.res.txt | awk -v dups=$dups -v singdups=$singdups -v ligation=$ligation -v singleend=1 -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
else
dups=$(samtools view -c -f 1089 -F 256 $sthreadstring $outputdir/merged_dedup.bam)
cat $splitdir/*.res.txt | awk -v dups=$dups -v ligation=$ligation -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
fi

cp $outputdir/inter.txt $outputdir/inter_30.txt

if [ $assembly -eq 1 ]
then
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt none
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt none
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt none
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt none
else
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt $genomePath
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt $genomePath
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt $genomePath
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt $genomePath
fi

# if early exit, we stop here, once the stats are calculated
Expand All @@ -718,28 +718,28 @@ if [ -z $postproc ]
if [ $assembly -eq 1 ]
then
samtools view $sthreadstring -O SAM -F 1024 $outputdir/merged_dedup.*am | awk -v mnd=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged_nodups.txt
fi
export splitdir=${splitdir}; export outputdir=${outputdir}; export early=1;
if ${juiceDir}/scripts/common/check.sh && [ "$cleanup" = 1 ]
then
${juiceDir}/scripts/common/cleanup.sh
fi
exit
fi
export splitdir=${splitdir}; export outputdir=${outputdir}; export early=1;
if ${juiceDir}/scripts/common/check.sh && [ "$cleanup" = 1 ]
then
${juiceDir}/scripts/common/cleanup.sh
fi
exit
fi
export IBM_JAVA_OPTIONS="-Xmx150000m -Xgcthreads1"
export _JAVA_OPTIONS="-Xmx150000m -Xms150000m"
mkdir ${outputdir}"/HIC_tmp"
if [ "$qc" = 1 ] || [ "$insitu" = 1 ]
then
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000"
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000"
else
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100"
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100"
fi
if [ "$nofrag" -eq 1 ]
then
fragstr=""
fragstr=""
else
fragstr="-f $site_file"
fragstr="-f $site_file"
fi

time ${juiceDir}/scripts/common/juicer_tools pre -n -s $outputdir/inter.txt -g $outputdir/inter_hists.m $fragstr $resstr $threadHicString $outputdir/merged1.txt $outputdir/inter.hic $genomePath
Expand All @@ -755,8 +755,17 @@ if [ -z $postproc ]
# POSTPROCESSING
if [ "$qc" != 1 ]
then
${juiceDir}/scripts/common/juicer_postprocessing.sh -j ${juiceDir}/scripts/common/juicer_tools -i ${outputdir}/inter_30.hic -m ${juiceDir}/references/motif -g ${genomeID}
fi
${juiceDir}/scripts/common/juicer_postprocessing.sh -j ${juiceDir}/scripts/common/juicer_tools -i ${outputdir}/inter_30.hic -m ${juiceDir}/references/motif -g ${genomeID}

# build mapq 30 accessiblity as part of postprocessing for Ultima
# needs kentUtils
if [ $singleend -eq 1 ] && [[ "$site" == "none" ]] && [[ ! -z "$genomePath" ]]
then
awk 'BEGIN{OFS="\t"}{cut[$2" "$3]++; cut[$6" "$7]++}END{for(i in cut){split(i, arr, " "); print arr[1], arr[2]-1, arr[2], cut[i]}}' merged30.txt | sort -k1,1 -k2,2n --parallel=$threads -S6G > merged30.bedgraph
bedGraphToBigWig merged30.bedgraph $genomePath inter_30.bw
fi
fi
]
fi
#CHECK THAT PIPELINE WAS SUCCESSFUL
export early=$earlyexit
Expand Down
49 changes: 42 additions & 7 deletions CPU/mega_from_bams_diploid.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
{
#### Description: Wrapper script to phase genomic variants from ENCODE DCC hic-pipeline.
#### Description: Wrapper script to calculate a "mega" hic file from a set of bams, with optional diploid portion.
#### Usage: bash ./mega.sh [-v|--vcf <path_to_vcf>] <path_to_merged_dedupped_bam_1> ... <path_to_merged_dedup_bam_N>.
#### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments.
#### Output: "mega" hic map and "mega" chromatin accessibility bw file.
Expand All @@ -16,7 +16,7 @@ USAGE="
*****************************************************
Simplified mega script for ENCODE DCC hic-pipeline.

USAGE: ./mega.sh -c|--chrom-sizes <path_to_chrom_sizes_file> [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf <path_to_vcf>] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf <path_to_psf>] [--reads-to-homologs <path_to_reads_to_homologs_file>] [--juicer-dir <path_to_juicer_dir>] [--phaser-dir <path_to_phaser_dir>] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--from-stage stage] [--to-stage stage] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>
USAGE: ./mega.sh -c|--chrom-sizes <path_to_chrom_sizes_file> [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf <path_to_vcf>] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf <path_to_psf>] [--reads-to-homologs <path_to_reads_to_homologs_file>] [--juicer-dir <path_to_juicer_dir>] [--phaser-dir <path_to_phaser_dir>] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage <stage>] [--to-stage <stage>] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>

DESCRIPTION:
This is a simplied mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks.
Expand All @@ -39,6 +39,12 @@ HAPLOID PORTION
-r|--resolutions [string]
Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10.

--shortcut-stats-1 [inter.hic_1,...inter.hic_N]
Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=1.

--shortcut-stats-30 [inter_30.hic_1,...inter_30.hic_N]
Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=30.

DIPLOID PORTION:
-v|--vcf [path_to_vcf]
Path to a Variant Call Format (vcf) file containing phased sequence variation data, e.g. as generated by the ENCODE DCC Hi-C variant calling & phasing pipeline. Passing a vcf file invokes a diploid section of the script.
Expand Down Expand Up @@ -78,6 +84,7 @@ WORKFLOW CONTROL:

# defaults:
resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10"
genome_id="hg38"
exclude_chr="Y|chrY|MT|chrM"
separate_homologs=false
mapq=1 ##lowest mapq of interest
Expand Down Expand Up @@ -138,6 +145,16 @@ while :; do
resolutionsToBuildString="-r "$OPTARG
shift
;;
--shortcut-stats-1) OPTARG=$2
echo "... --shortcut-stats-1 flag was triggered with $OPTARG value." >&1
shortcut_stats_1=$OPTARG
shift
;;
--shortcut-stats-30) OPTARG=$2
echo "... --shortcut-stats-30 flag was triggered with $OPTARG value." >&1
shortcut_stats_30=$OPTARG
shift
;;
## DIPLOID PORTION
-v|--vcf) OPTARG=$2
if [ -s $OPTARG ] && [[ $OPTARG == *.vcf ]]; then
Expand Down Expand Up @@ -332,7 +349,7 @@ if [ "$first_stage" == "hic" ]; then
export SHELL=$(type -p bash)
doit () {
mapq=$2
samtools view -@ 2 -h -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}}}(mq<mapq){next}$7=="="{if(ip>mp){next}else if (ip==mp){keep[$1]=$0}else{print 0, $3, ip, 0, 0, $3, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(i in keep){n=split(keep[i],a,"\t"); for(s=12;s<=n;s++){if(a[s]~"^ip:i:"){ip=substr(a[s],6)}}; print 0, a[3], ip, 0, 0, a[3], ip, 1}}'
samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mq<mapq){next}$7=="*"{if(!chimeric){next}else{split(substr($chimeric,6),a,","); $7=a[1]}}$7=="="{$7=$3}!cb{next}{cbchr1=gensub("_","","g",$3);cbchr2=gensub("_","","g",$7);testcb1="cb:Z:"cbchr1"_"cbchr2"_"; testcb2="cb:Z:"cbchr2"_"cbchr1"_"}($cb!~testcb1&&$cb!~testcb2){next}(!ip||!mp){next}$7==$3{if(ip>mp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}'
}

export -f doit
Expand Down Expand Up @@ -365,8 +382,16 @@ if [ "$first_stage" == "hic" ]; then
# awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite "rm out.{}.txt err.{}.txt"
# rm temp.log

touch inter.txt inter_hists.m inter_30.txt inter_hists_30.m

touch inter.txt inter_hists.m
if [ ! -z ${shortcut_stats_1} ]; then
tmpstr=""
IFS=',' read -ra STATS <<< "$shortcut_stats_1"
for i in "${STATS[@]}"; do
[ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; }
done
[ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter ${tmpstr:1}
fi

if [[ $threadsHic -gt 1 ]] && [[ ! -s merged1_index.txt ]]
then
"${juicer_dir}"/scripts/index_by_chr.awk merged1.txt 500000 > merged1_index.txt
Expand All @@ -376,9 +401,19 @@ if [ "$first_stage" == "hic" ]; then
threadHicString=""
fi

touch inter_30.txt inter_30_hists.m
if [ ! -z ${shortcut_stats_30} ]; then
tmpstr=""
IFS=',' read -ra STATS <<< "$shortcut_stats_30"
for i in "${STATS[@]}"; do
[ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; }
done
[ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter_30 ${tmpstr:1}
fi

if [[ $threadsHic -gt 1 ]] && [[ ! -s merged30_index.txt ]]
then
"${juicer_dir}"/scripts/common/index_by_chr.awk merged30.txt 500000 > merged30_index.txt
"${juicer_dir}"/scripts/index_by_chr.awk merged30.txt 500000 > merged30_index.txt
tempdirPre30="HIC30_tmp" && mkdir "${tempdirPre30}"
threadHic30String="--threads $threadsHic -i merged30_index.txt -t ${tempdirPre30}"
else
Expand All @@ -396,7 +431,7 @@ if [ "$first_stage" == "hic" ]; then
## TODO: check for failure

"${juicer_dir}"/scripts/juicer_tools pre -n -s inter_30.txt -g inter_30_hists.m -q 30 "$resolutionsToBuildString" "$threadHic30String" merged30.txt inter_30.hic "$chrom_sizes"
"${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic "${outputDir}"/inter_30.hic
"${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_30.hic
rm -Rf "${tempdirPre30}"
## TODO: check for failure

Expand Down
Loading