From b5c2023357887a3ed13d3941715b29b9efe0ff2f Mon Sep 17 00:00:00 2001 From: dudcha Date: Mon, 17 Jan 2022 21:44:05 -0600 Subject: [PATCH 01/10] First path adding stats shortcuts --- CPU/mega_from_bams_diploid.sh | 42 +++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/CPU/mega_from_bams_diploid.sh b/CPU/mega_from_bams_diploid.sh index 22e0b8f..2e09750 100755 --- a/CPU/mega_from_bams_diploid.sh +++ b/CPU/mega_from_bams_diploid.sh @@ -1,6 +1,6 @@ #!/bin/bash { -#### Description: Wrapper script to phase genomic variants from ENCODE DCC hic-pipeline. +#### Description: Wrapper script to calculate a "mega" hic file from a set of bams, with optional diploid portion. #### Usage: bash ./mega.sh [-v|--vcf ] ... . #### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments. #### Output: "mega" hic map and "mega" chromatin accessibility bw file. @@ -16,7 +16,7 @@ USAGE=" ***************************************************** Simplified mega script for ENCODE DCC hic-pipeline. -USAGE: ./mega.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf ] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf ] [--reads-to-homologs ] [--juicer-dir ] [--phaser-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--from-stage stage] [--to-stage stage] ... +USAGE: ./mega.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf ] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf ] [--reads-to-homologs ] [--juicer-dir ] [--phaser-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... DESCRIPTION: This is a simplied mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. 
The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks. @@ -39,6 +39,12 @@ HAPLOID PORTION -r|--resolutions [string] Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10. +--shortcut-stats-1 [inter.hic_1,...inter.hic_N] + Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=1. + +--shortcut-stats-30 [inter_30.hic_1,...inter_30.hic_N] + Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=30. + DIPLOID PORTION: -v|--vcf [path_to_vcf] Path to a Variant Call Format (vcf) file containing phased sequence variation data, e.g. as generated by the ENCODE DCC Hi-C variant calling & phasing pipeline. Passing a vcf file invokes a diploid section of the script. @@ -138,6 +144,16 @@ while :; do resolutionsToBuildString="-r "$OPTARG shift ;; + --shortcut-stats-1) OPTARG=$2 + echo "... --shortcut-stats-1 flag was triggered with $OPTARG value." >&1 + shortcut_stats_1=$OPTARG + shift + ;; + --shortcut-stats-30) OPTARG=$2 + echo "... --shortcut-stats-30 flag was triggered with $OPTARG value." >&1 + shortcut_stats_30=$OPTARG + shift + ;; ## DIPLOID PORTION -v|--vcf) OPTARG=$2 if [ -s $OPTARG ] && [[ $OPTARG == *.vcf ]]; then @@ -365,8 +381,16 @@ if [ "$first_stage" == "hic" ]; then # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite "rm out.{}.txt err.{}.txt" # rm temp.log - touch inter.txt inter_hists.m inter_30.txt inter_hists_30.m - + touch inter.txt inter_hists.m + if [ ! -z ${inter_stats_1} ]; then + tmpstr="" + IFS=';' read -ra STATS <<< "$inter_stats_1" + for i in "${STATS[@]}"; do + [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the hic files indicated for calculating stats does not exist at expected location or is empty. 
Continuing without the stats!"; tmpstr=""; break; } + done + [[ "$tmpstr" != "" ]] && java -jar ${juicer_dir}/merge-stats.jar inter ${tmpstr:1:-1} + fi + if [[ $threadsHic -gt 1 ]] && [[ ! -s merged1_index.txt ]] then "${juicer_dir}"/scripts/index_by_chr.awk merged1.txt 500000 > merged1_index.txt @@ -376,6 +400,16 @@ if [ "$first_stage" == "hic" ]; then threadHicString="" fi + touch inter_30.txt inter_hists_30.m + if [ ! -z ${inter_stats_30} ]; then + tmpstr="" + IFS=';' read -ra STATS <<< "$inter_stats_30" + for i in "${STATS[@]}"; do + [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the hic files indicated for calculating stats does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } + done + [[ "$tmpstr" != "" ]] && java -jar ${juicer_dir}/merge-stats.jar inter_30 ${tmpstr:1:-1} + fi + if [[ $threadsHic -gt 1 ]] && [[ ! -s merged30_index.txt ]] then "${juicer_dir}"/scripts/common/index_by_chr.awk merged30.txt 500000 > merged30_index.txt From d004e710fe0e3a224a96562859bc0b7c4ddacc3d Mon Sep 17 00:00:00 2001 From: dudcha Date: Mon, 17 Jan 2022 22:54:03 -0600 Subject: [PATCH 02/10] cleaning up stats calculation --- CPU/mega_from_bams_diploid.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/CPU/mega_from_bams_diploid.sh b/CPU/mega_from_bams_diploid.sh index 2e09750..5ce8325 100755 --- a/CPU/mega_from_bams_diploid.sh +++ b/CPU/mega_from_bams_diploid.sh @@ -84,6 +84,7 @@ WORKFLOW CONTROL: # defaults: resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10" +genome_id="hg38" exclude_chr="Y|chrY|MT|chrM" separate_homologs=false mapq=1 ##lowest mapq of interest @@ -382,13 +383,13 @@ if [ "$first_stage" == "hic" ]; then # rm temp.log touch inter.txt inter_hists.m - if [ ! -z ${inter_stats_1} ]; then + if [ ! 
-z ${shortcut_stats_1} ]; then tmpstr="" - IFS=';' read -ra STATS <<< "$inter_stats_1" + IFS=',' read -ra STATS <<< "$shortcut_stats_1" for i in "${STATS[@]}"; do - [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the hic files indicated for calculating stats does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } + [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } done - [[ "$tmpstr" != "" ]] && java -jar ${juicer_dir}/merge-stats.jar inter ${tmpstr:1:-1} + [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter ${tmpstr:1} fi if [[ $threadsHic -gt 1 ]] && [[ ! -s merged1_index.txt ]] @@ -400,19 +401,19 @@ if [ "$first_stage" == "hic" ]; then threadHicString="" fi - touch inter_30.txt inter_hists_30.m - if [ ! -z ${inter_stats_30} ]; then + touch inter_30.txt inter_30_hists.m + if [ ! -z ${shortcut_stats_30} ]; then tmpstr="" - IFS=';' read -ra STATS <<< "$inter_stats_30" + IFS=',' read -ra STATS <<< "$shortcut_stats_30" for i in "${STATS[@]}"; do - [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the hic files indicated for calculating stats does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } + [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } done - [[ "$tmpstr" != "" ]] && java -jar ${juicer_dir}/merge-stats.jar inter_30 ${tmpstr:1:-1} + [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter_30 ${tmpstr:1} fi if [[ $threadsHic -gt 1 ]] && [[ ! 
-s merged30_index.txt ]] then - "${juicer_dir}"/scripts/common/index_by_chr.awk merged30.txt 500000 > merged30_index.txt + "${juicer_dir}"/scripts/index_by_chr.awk merged30.txt 500000 > merged30_index.txt tempdirPre30="HIC30_tmp" && mkdir "${tempdirPre30}" threadHic30String="--threads $threadsHic -i merged30_index.txt -t ${tempdirPre30}" else @@ -430,7 +431,7 @@ if [ "$first_stage" == "hic" ]; then ## TODO: check for failure "${juicer_dir}"/scripts/juicer_tools pre -n -s inter_30.txt -g inter_30_hists.m -q 30 "$resolutionsToBuildString" "$threadHic30String" merged30.txt inter_30.hic "$chrom_sizes" - "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic "${outputDir}"/inter_30.hic + "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_30.hic rm -Rf "${tempdirPre30}" ## TODO: check for failure From 8a0384e9a584fc1abaf915914bd10b8f2e2c9d91 Mon Sep 17 00:00:00 2001 From: dudcha Date: Wed, 19 Jan 2022 21:15:46 -0600 Subject: [PATCH 03/10] add haploid accessibility to postprocessing --- CPU/juicer.sh | 143 +++++++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 67 deletions(-) diff --git a/CPU/juicer.sh b/CPU/juicer.sh index 1faf5cd..8defc98 100755 --- a/CPU/juicer.sh +++ b/CPU/juicer.sh @@ -627,62 +627,62 @@ fi #Skip if post-processing only is required if [ -z $postproc ] - then - # if we haven't already zipped up merged_dedup - if [ ! -s ${outputdir}/merged_dedup.bam ] - then - # Check that dedupping worked properly - # in ideal world, we would check this in split_rmdups and not remove before we know they are correct - size1=$(samtools view $sthreadstring -h ${outputdir}/merged_sort.bam | wc -l | awk '{print $1}') - size2=$(wc -l ${outputdir}/merged_dedup.sam | awk '{print $1}') - - if [ $size1 -ne $size2 ] - then - echo "***! Error! The sorted file and dups/no dups files do not add up, or were empty." 
- exit 1 - fi - samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt - samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt - else - if [ ! -s ${outputdir}/merged1.txt ] - then - samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt - fi - if [ ! -s ${outputdir}/merged30.txt ] - then - samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt - fi +then + # if we haven't already zipped up merged_dedup + if [ ! -s ${outputdir}/merged_dedup.bam ] + then + # Check that dedupping worked properly + # in ideal world, we would check this in split_rmdups and not remove before we know they are correct + size1=$(samtools view $sthreadstring -h ${outputdir}/merged_sort.bam | wc -l | awk '{print $1}') + size2=$(wc -l ${outputdir}/merged_dedup.sam | awk '{print $1}') + + if [ $size1 -ne $size2 ] + then + echo "***! Error! The sorted file and dups/no dups files do not add up, or were empty." + exit 1 + fi + samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt + samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt + else + if [ ! -s ${outputdir}/merged1.txt ] + then + samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt + fi + if [ ! 
-s ${outputdir}/merged30.txt ] + then + samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt + fi fi if [[ $threadsHic -gt 1 ]] then - if [ ! -s ${outputdir}/merged1_index.txt ] - then - time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged1.txt 500000 > ${outputdir}/merged1_index.txt - fi - if [ ! -s ${outputdir}/merged30_index.txt ] - then - time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged30.txt 500000 > ${outputdir}/merged30_index.txt - fi + if [ ! -s ${outputdir}/merged1_index.txt ] + then + time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged1.txt 500000 > ${outputdir}/merged1_index.txt + fi + if [ ! -s ${outputdir}/merged30_index.txt ] + then + time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged30.txt 500000 > ${outputdir}/merged30_index.txt + fi fi if [ ! -s ${outputdir}/merged_dedup.bam ] then - if samtools view -b $sthreadstring ${outputdir}/merged_dedup.sam > ${outputdir}/merged_dedup.bam - then - rm ${outputdir}/merged_dedup.sam - rm ${outputdir}/merged_sort.bam - fi + if samtools view -b $sthreadstring ${outputdir}/merged_dedup.sam > ${outputdir}/merged_dedup.bam + then + rm ${outputdir}/merged_dedup.sam + rm ${outputdir}/merged_sort.bam + fi fi if [ "$methylation" = 1 ] then - $load_methyl - samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam - $call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam - $call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam - ${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt - rm ${outputdir}/merged_dedup_sort.bam ${outputdir}/merged_dedup_sort.cytosine_report.txt* + $load_methyl + samtools sort 
$sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam + $call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam + $call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam + ${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt + rm ${outputdir}/merged_dedup_sort.bam ${outputdir}/merged_dedup_sort.cytosine_report.txt* fi @@ -691,24 +691,24 @@ if [ -z $postproc ] tail -n1 $headfile | awk '{printf"%-1000s\n", $0}' > $outputdir/inter.txt if [ $singleend -eq 1 ] then - ret=$(samtools view $sthreadstring -f 1024 -F 256 $outputdir/merged_dedup.bam | awk '{if ($0~/rt:A:7/){singdup++}else{dup++}}END{print dup,singdup}') - dups=$(echo $ret | awk '{print $1}') - singdups=$(echo $ret | awk '{print $2}') - cat $splitdir/*.res.txt | awk -v dups=$dups -v singdups=$singdups -v ligation=$ligation -v singleend=1 -f ${juiceDir}/scripts/stats_sub.awk >> $outputdir/inter.txt + ret=$(samtools view $sthreadstring -f 1024 -F 256 $outputdir/merged_dedup.bam | awk '{if ($0~/rt:A:7/){singdup++}else{dup++}}END{print dup,singdup}') + dups=$(echo $ret | awk '{print $1}') + singdups=$(echo $ret | awk '{print $2}') + cat $splitdir/*.res.txt | awk -v dups=$dups -v singdups=$singdups -v ligation=$ligation -v singleend=1 -f ${juiceDir}/scripts/stats_sub.awk >> $outputdir/inter.txt else - dups=$(samtools view -c -f 1089 -F 256 $sthreadstring $outputdir/merged_dedup.bam) - cat $splitdir/*.res.txt | awk -v dups=$dups -v ligation=$ligation -f ${juiceDir}/scripts/stats_sub.awk >> $outputdir/inter.txt + dups=$(samtools view -c -f 1089 -F 256 $sthreadstring $outputdir/merged_dedup.bam) + cat $splitdir/*.res.txt | awk -v dups=$dups -v ligation=$ligation -f ${juiceDir}/scripts/stats_sub.awk >> $outputdir/inter.txt fi cp $outputdir/inter.txt $outputdir/inter_30.txt if [ $assembly -eq 
1 ] then - ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt none - ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt none + ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt none + ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt none else - ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt $genomePath - ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt $genomePath + ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt $genomePath + ${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt $genomePath fi # if early exit, we stop here, once the stats are calculated @@ -717,28 +717,28 @@ if [ -z $postproc ] if [ $assembly -eq 1 ] then samtools view $sthreadstring -O SAM -F 1024 $outputdir/merged_dedup.*am | awk -v mnd=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged_nodups.txt - fi - export splitdir=${splitdir}; export outputdir=${outputdir}; export early=1; - if ${juiceDir}/scripts/common/check.sh && [ "$cleanup" = 1 ] - then - ${juiceDir}/scripts/common/cleanup.sh - fi - exit + fi + export splitdir=${splitdir}; export outputdir=${outputdir}; export early=1; + if ${juiceDir}/scripts/common/check.sh && [ "$cleanup" = 1 ] + then + ${juiceDir}/scripts/common/cleanup.sh + fi + exit fi export IBM_JAVA_OPTIONS="-Xmx150000m -Xgcthreads1" export _JAVA_OPTIONS="-Xmx150000m -Xms150000m" mkdir ${outputdir}"/HIC_tmp" if [ "$qc" = 1 ] || [ "$insitu" = 1 ] then - resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000" + resstr="-r 
2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000" else - resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100" + resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100" fi if [ "$nofrag" -eq 1 ] then - fragstr="" + fragstr="" else - fragstr="-f $site_file" + fragstr="-f $site_file" fi time ${juiceDir}/scripts/common/juicer_tools pre -n -s $outputdir/inter.txt -g $outputdir/inter_hists.m $fragstr $resstr $threadHicString $outputdir/merged1.txt $outputdir/inter.hic $genomePath @@ -754,8 +754,17 @@ if [ -z $postproc ] # POSTPROCESSING if [ "$qc" != 1 ] then - ${juiceDir}/scripts/common/juicer_postprocessing.sh -j ${juiceDir}/scripts/common/juicer_tools -i ${outputdir}/inter_30.hic -m ${juiceDir}/references/motif -g ${genomeID} - fi + ${juiceDir}/scripts/common/juicer_postprocessing.sh -j ${juiceDir}/scripts/common/juicer_tools -i ${outputdir}/inter_30.hic -m ${juiceDir}/references/motif -g ${genomeID} + + # build mapq 30 accessiblity as part of postprocessing for Ultima + # needs kentUtils + if [ $singleend -eq 1 ] && [[ "$site" == "none" ]] && [[ ! 
-z "$genomePath" ]] + then + awk 'BEGIN{OFS="\t"}{cut[$2" "$3]++; cut[$6" "$7]++}END{for(i in cut){split(i, arr, " "); print arr[1], arr[2]-1, arr[2], cut[i]}}' merged30.txt | sort -k1,1 -k2,2n --parallel=$threads -S6G > merged30.bedgraph + bedGraphToBigWig merged30.bedgraph $genomePath inter_30.bw + fi + fi + fi #CHECK THAT PIPELINE WAS SUCCESSFUL export early=$earlyexit From 365a28f8605fcba247887586e1550b4e8bbc5990 Mon Sep 17 00:00:00 2001 From: dudcha Date: Thu, 20 Jan 2022 13:56:27 -0600 Subject: [PATCH 04/10] Last sync before breaking up --- CPU/mega_from_bams_diploid.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CPU/mega_from_bams_diploid.sh b/CPU/mega_from_bams_diploid.sh index 5ce8325..73b93b9 100755 --- a/CPU/mega_from_bams_diploid.sh +++ b/CPU/mega_from_bams_diploid.sh @@ -349,7 +349,7 @@ if [ "$first_stage" == "hic" ]; then export SHELL=$(type -p bash) doit () { mapq=$2 - samtools view -@ 2 -h -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}}}(mqmp){next}else if (ip==mp){keep[$1]=$0}else{print 0, $3, ip, 0, 0, $3, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(i in keep){n=split(keep[i],a,"\t"); for(s=12;s<=n;s++){if(a[s]~"^ip:i:"){ip=substr(a[s],6)}}; print 0, a[3], ip, 0, 0, a[3], ip, 1}}' + samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mqmp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}' } export -f doit From
128effde376ef7b7f03cd2b168dabe69ab3c9027 Mon Sep 17 00:00:00 2001 From: dudcha Date: Thu, 20 Jan 2022 13:58:21 -0600 Subject: [PATCH 05/10] duplicate code to extract megafication --- CPU/megafy.sh | 700 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 700 insertions(+) create mode 100755 CPU/megafy.sh diff --git a/CPU/megafy.sh b/CPU/megafy.sh new file mode 100755 index 0000000..73b93b9 --- /dev/null +++ b/CPU/megafy.sh @@ -0,0 +1,700 @@ +#!/bin/bash +{ +#### Description: Wrapper script to calculate a "mega" hic file from a set of bams, with optional diploid portion. +#### Usage: bash ./mega.sh [-v|--vcf ] ... . +#### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments. +#### Output: "mega" hic map and "mega" chromatin accessibility bw file. +#### Optional input: phased vcf file. When passed launches generation of diploid versions of output. +#### Dependencies: Java, Samtools, GNU Parallel, KentUtils, Juicer, 3D-DNA (for diploid portion only). +#### Written by: + +echo "*****************************************************" >&1 +echo "cmd log: "$0" "$* >&1 +echo "*****************************************************" >&1 + +USAGE=" +***************************************************** +Simplified mega script for ENCODE DCC hic-pipeline. + +USAGE: ./mega.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf ] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf ] [--reads-to-homologs ] [--juicer-dir ] [--phaser-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... + +DESCRIPTION: +This is a simplied mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. 
The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks. + +ARGUMENTS: +path_to_merged_dedup_bam + Path to bam file containing deduplicated alignments of Hi-C reads in bam format (output by Juicer2). Multiple bam files are expected to be passed as arguments. + +OPTIONS: +-h|--help + Shows this help. + +HAPLOID PORTION +-g|--genome-id [genome_id] + Genome id, e.g. hg38, for some of the common references used by Juicer. Used to run the motif finder. + +-c|--chrom-sizes [path_to_chrom_sizes_file] + Path to chrom.sizes file for the reference used when processing the individual Hi-C experiments. + +-r|--resolutions [string] + Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10. + +--shortcut-stats-1 [inter.hic_1,...inter.hic_N] + Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=1. + +--shortcut-stats-30 [inter_30.hic_1,...inter_30.hic_N] + Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=30. + +DIPLOID PORTION: +-v|--vcf [path_to_vcf] + Path to a Variant Call Format (vcf) file containing phased sequence variation data, e.g. as generated by the ENCODE DCC Hi-C variant calling & phasing pipeline. Passing a vcf file invokes a diploid section of the script. + +-C|--exclude-chr-from-diploid [chr_list] + Remove specific molecules from the chromosome list (default: chrY). Note that -c and -C are incompatible in that if both are invoked, only the last listed option will be taken into account. Default: chrY. + +--separate-homologs + Build two separate contact maps & two separate accessibility tracks for homologs as opposed to a single diploid contact map with interleaved homologous chromosomes.
This is the preferred mode for comparing contacts across homologs using the \"Observed over Control\" view for map comparison in Juicebox. Importantly, assignment of chromosomes to homologs is arbitrary. Default: not invoked. +-p|--psf [path_to_psf] + Path to a 3D-DNA phaser psf file containing heterozygous variant and phasing information. Optional input to fast-forward some steps & save compute on processing the vcf file. + +--reads-to-homologs [path_to_reads_to_homologs_file] + Path to a reads_to_homologs file generated by the phasing pipeline. Optional input to fast-forward some steps & save compute on diploid processing (assumes the input bams were used for phasing). + +WORKFLOW CONTROL: +-t|--threads [num] + Indicate how many threads to use. Default: half of available cores as calculated by parallel --number-of-cores. + +-T|--threads-hic [num] + Indicate how many threads to use when generating the Hi-C file. Default: 24. + +--juicer-dir [path_to_juicer_dir] + Path to Juicer directory, contains scripts/, references/, and restriction_sites/ + +--phaser-dir [path_to_3ddna_dir] + Path to 3D-DNA directory, contains phase/ + +--from-stage [pipeline_stage] + Fast-forward to a particular stage of the pipeline. The pipeline_stage argument can be \"prep\", \"hic\", \"hicnarrow\", \"dhs\", \"diploid_hic\", \"diploid_dhs\", \"cleanup\". + +--to-stage [pipeline_stage] + Exit after a particular stage of the pipeline. The argument can be \"prep\", \"hic\", \"hicnarrow\", \"dhs\", \"diploid_hic\", \"diploid_dhs\", \"cleanup\". 
+ +***************************************************** +" + +# defaults: +resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10" +genome_id="hg38" +exclude_chr="Y|chrY|MT|chrM" +separate_homologs=false +mapq=1 ##lowest mapq of interest + +# multithreading +threads=`parallel --number-of-cores` +threads=$((threads/2)) +# adjust for mem usage +tmp=`awk '/MemTotal/ {threads=int($2/1024/1024/2/6-1)}END{print threads+0}' /proc/meminfo 2>/dev/null` +tmp=$((tmp+0)) +([ $tmp -gt 0 ] && [ $tmp -lt $threads ]) && threads=$tmp +threadsHic=$threads + +## Create temporary directory for parallelization +# tmpdir="HIC_tmp" +# if [ ! -d "$tmpdir" ]; then +# mkdir "$tmpdir" +# chmod 777 "$tmpdir" +# fi +# export TMPDIR=${tmpdir} + +#staging +first_stage="prep" +last_stage="cleanup" +declare -A stage +stage[prep]=0 +stage[hic]=1 +stage[hicnarrow]=2 +stage[dhs]=3 +stage[diploid_hic]=4 +stage[diploid_dhs]=5 +stage[cleanup]=6 + +############### HANDLE OPTIONS ############### + +while :; do + case $1 in + -h|--help) + echo "$USAGE" >&1 + exit 0 + ;; +## HAPLOID PORTION + -g|--genome-id) OPTARG=$2 + genome_id=$OPTARG + shift + ;; + -c|--chrom-sizes) OPTARG=$2 + if [ -s $OPTARG ] && [[ $OPTARG == *.chrom.sizes ]]; then + echo "... -c|--chrom-sizes flag was triggered with $OPTARG value." >&1 + chrom_sizes=$OPTARG + else + echo " :( Chrom.sizes file is not found at expected location, is empty or does not have the expected extension. Exiting!" >&2 + exit 1 + fi + shift + ;; + -r|--resolutions) OPTARG=$2 + resolutionsToBuildString="-r "$OPTARG + shift + ;; + --shortcut-stats-1) OPTARG=$2 + echo "... --shortcut-stats-1 flag was triggered with $OPTARG value." >&1 + shortcut_stats_1=$OPTARG + shift + ;; + --shortcut-stats-30) OPTARG=$2 + echo "... --shortcut-stats-30 flag was triggered with $OPTARG value." 
>&1 + shortcut_stats_30=$OPTARG + shift + ;; +## DIPLOID PORTION + -v|--vcf) OPTARG=$2 + if [ -s $OPTARG ] && [[ $OPTARG == *.vcf ]]; then + echo "... -v|--vcf flag was triggered, will try to generate diploid versions of the hic file and accessibility track based on phasing data in $OPTARG." >&1 + vcf=$OPTARG + else + echo " :( Vcf file is not found at expected location, is empty or does not have the expected extension. Exiting!" >&2 + exit 1 + fi + shift + ;; + -C|--exclude-chr-from-diploid) OPTARG=$2 + echo "... -C|--exclude-chr-from-diploid flag was triggered, will ignore variants on $OPTARG." >&1 + exclude_chr=$OPTARG + shift + ;; + --separate-homologs) + echo "... --separate-homologs flag was triggered, will build two separate contact maps and two separate accessibility tracks (_r.hic/_a.hic and _r.bw/_a.bw) for chromosomal homologs with identical chromosomal labels." >&1 + separate_homologs=true + ;; + -p|--psf) OPTARG=$2 + if [ -s $OPTARG ] && [[ $OPTARG == *.psf ]]; then + echo "... -p|--psf flag was triggered, will try to generate diploid versions of the hic file and accessibility track based on phasing data in $OPTARG." >&1 + psf=$OPTARG + else + echo " :( Psf file is not found at expected location, is empty or does not have the expected extension. Exiting!" >&2 + exit 1 + fi + shift + ;; + --reads-to-homologs) OPTARG=$2 + if [ -s $OPTARG ] && [[ $OPTARG == *.txt ]]; then + echo "... --reads-to-homologs flag was triggered, will try to generate diploid versions of the hic file and accessibility track based on reads-to-homolog data in $OPTARG." >&1 + reads_to_homologs=$OPTARG + else + echo " :( File is not found at expected location, is empty or does not have the expected extension. Exiting!" >&2 + exit 1 + fi + shift + ;; +## WORKFLOW + -t|--threads) OPTARG=$2 + re='^[0-9]+$' + if [[ $OPTARG =~ $re ]]; then + echo "... -t|--threads flag was triggered, will try to parallelize across $OPTARG threads." 
>&1 + threads=$OPTARG + else + echo " :( Wrong syntax for thread count parameter value. Exiting!" >&2 + exit 1 + fi + shift + ;; + -T|--threads-hic) OPTARG=$2 + re='^[0-9]+$' + if [[ $OPTARG =~ $re ]]; then + echo "... -T|--threads-hic flag was triggered, will try to parallelize across $OPTARG threads when building hic map." >&1 + threadsHic=$OPTARG + else + echo " :( Wrong syntax for hic thread count parameter value. Exiting!" >&2 + exit 1 + fi + shift + ;; + --juicer-dir) OPTARG=$2 + if [ -d $OPTARG ]; then + echo "... --juicer-dir flag was triggered with $OPTARG." >&1 + juicer_dir=$OPTARG + else + exit 1 + echo " :( Juicer folder not found at expected location. Exiting!" >&2 + fi + shift + ;; + --phaser-dir) OPTARG=$2 + if [ -d $OPTARG ]; then + echo "... --phaser-dir flag was triggered with $OPTARG." >&1 + phaser_dir=$OPTARG + else + exit 1 + echo " :( Phaser folder not found at expected location. Exiting!" >&2 + fi + shift + ;; + --from-stage) OPTARG=$2 + if [ "$OPTARG" == "prep" ] || [ "$OPTARG" == "hic" ] || [ "$OPTARG" == "hicnarrow" ] || [ "$OPTARG" == "dhs" ] || [ "$OPTARG" == "diploid_hic" ] || [ "$OPTARG" == "diploid_dhs" ] || [ "$OPTARG" == "cleanup" ]; then + echo "... --from-stage flag was triggered. Will fast-forward to $OPTARG." >&1 + first_stage=$OPTARG + else + echo " :( Wrong syntax for pipeline stage. Please use prep/hic/hicnarrow/dhs/diploid_hic/diploid_dhs/cleanup. Exiting!" >&2 + exit 1 + fi + shift + ;; + --to-stage) OPTARG=$2 + if [ "$OPTARG" == "prep" ] || [ "$OPTARG" == "hic" ] || [ "$OPTARG" == "hicnarrow" ] || [ "$OPTARG" == "dhs" ] || [ "$OPTARG" == "diploid_hic" ] || [ "$OPTARG" == "diploid_dhs" ] || [ "$OPTARG" == "cleanup" ]; then + echo "... --to-stage flag was triggered. Will exit after $OPTARG." >&1 + last_stage=$OPTARG + else + echo " :( Wrong syntax for pipeline stage. Please use prep/hic/hicnarrow/dhs/diploid_hic/diploid_dhs/cleanup. Exiting!" 
>&2 + exit 1 + fi + shift + ;; +### utilitarian + --) # End of all options + shift + break + ;; + -?*) + echo ":| WARNING: Unknown option. Ignoring: ${1}" >&2 + ;; + *) # Default case: If no more options then break out of the loop. + break + esac + shift +done + +## optional TODO: give error if diploid options are invoked without a vcf file + +if [[ "${stage[$first_stage]}" -gt "${stage[$last_stage]}" ]]; then + echo >&2 ":( Please make sure that the first stage requested is in fact an earlier stage of the pipeline to the one requested as last. Exiting!" + exit 1 +fi + +[ -z $chrom_sizes ] && { echo >&2 ":( Chrom.sizes file is not optional. Please use the -c flag to point to the chrom.sizes file used when running Juicer. Exiting!"; exit 1; } + +[ -z $genome_id ] && { echo >&2 ":| Warning: no genome_id is provided. Please provide a genome_id if using one of the common references to be able to run the motif finder. Ignoring motif finder!"; } + +############### HANDLE DEPENDENCIES ############### + +## Juicer & Phaser + +[ -z $juicer_dir ] && { echo >&2 ":( Juicer directory is not specified. Exiting!"; exit 1; } +([ ! -z $vcf ] && [ -z $phaser_dir ]) && { echo >&2 ":( Phaser directory is not specified. Exiting!"; exit 1; } + +## Java Dependency +type java >/dev/null 2>&1 || { echo >&2 ":( Java is not available, please install/add to path Java. Exiting!"; exit 1; } + +## GNU Parallel Dependency +type parallel >/dev/null 2>&1 || { echo >&2 ":( GNU Parallel support is set to true (default) but GNU Parallel is not in the path. Please install GNU Parallel or set -p option to false. Exiting!"; exit 1; } +[ $(parallel --version | awk 'NR==1{print $3}') -ge 20150322 ] || { echo >&2 ":( Outdated version of GNU Parallel is installed. Please install/add to path v 20150322 or later. Exiting!"; exit 1; } + +## Samtools Dependency +type samtools >/dev/null 2>&1 || { echo >&2 ":( Samtools are not available, please install/add to path. 
Exiting!"; exit 1; } +ver=`samtools --version | awk 'NR==1{print \$NF}'` +[[ $(echo "$ver < 1.13" |bc -l) -eq 1 ]] && { echo >&2 ":( Outdated version of samtools is installed. Please install/add to path v 1.13 or later. Exiting!"; exit 1; } + +## kentUtils Dependency +type bedGraphToBigWig >/dev/null 2>&1 || { echo >&2 ":( bedGraphToBigWig is not available, please install/add to path, e.g. from kentUtils. Exiting!"; exit 1; } + +############### HANDLE ARGUMENTS ############### + +bam=`echo "${@:1}"` +##TODO: check file extentions + +############### MAIN ################# +## 0. PREP BAM FILE + +if [ "$first_stage" == "prep" ]; then + + echo "...Extracting unique paired alignments from bams and sorting..." >&1 + + # make header for the merged file pipe + parallel --will-cite "samtools view -H {} > {}_header.bam" ::: $bam + header_list=`parallel --will-cite "printf %s' ' {}_header.bam" ::: $bam` + samtools merge --no-PG -f mega_header.bam ${header_list} + rm ${header_list} + + samtools cat -@ $((threads * 2)) -h mega_header.bam $bam | samtools view -u -d "rt:0" -d "rt:1" -d "rt:2" -d "rt:3" -d "rt:4" -d "rt:5" -@ $((threads * 2)) -F 0x400 -q $mapq - | samtools sort -@ $threads -m 6G -o reads.sorted.bam + [ `echo "${PIPESTATUS[@]}" | tr -s ' ' + | bc` -eq 0 ] || { echo ":( Pipeline failed at bam sorting. See stderr for more info. Exiting!" | tee -a /dev/stderr && exit 1; } + rm mega_header.bam + + samtools index -@ $threads reads.sorted.bam + [ $? -eq 0 ] || { echo ":( Failed at bam indexing. See stderr for more info. Exiting!" | tee -a /dev/stderr && exit 1; } + # e.g. will fail with chr longer than ~500Mb. Use samtools index -c -m 14 reads.sorted.bam + + echo ":) Done extracting unique paired alignments from bam and sorting." >&1 + + [ "$last_stage" == "prep" ] && { echo "Done with the requested workflow. Exiting after prepping bam!"; exit; } + first_stage="hic" + +fi + +## I. HIC + +if [ "$first_stage" == "hic" ]; then + + echo "...Generating mega hic file..." 
>&1 + + ([ -f reads.sorted.bam ] && [ -f reads.sorted.bam.bai ]) || { echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr; exit 1; } + + export SHELL=$(type -p bash) + doit () { + mapq=$2 + samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mqmp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}' + } + + export -f doit + + ## opt1 + ## extra variable contains all (small) sequences that are not already in the chrom.sizes file. There is no use in them for generating the hic file, they are generated only for consistensy (and potentially stats) + extra=`samtools view -H reads.sorted.bam | grep '^@SQ' | sed "s/.*SN:\([^\t]*\).*/\1/g" | awk 'FILENAME==ARGV[1]{drop[$1]=1;next}!($1 in drop)' ${chrom_sizes} - | xargs` + + #merged1.txt + awk -v extra="$extra" '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 1 >merged1.txt 2>merged1.tmp.txt + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at generating the mega mnd file. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } + rm temp.log + sort -k2,2 -k6,6 -S 6G --parallel=${threads} merged1.tmp.txt >> merged1.txt && rm merged1.tmp.txt + + #merged30.txt + ## @MUHAMMAD: why do we have two now rather than running pre from the same one? 
+ awk -v extra="$extra" '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 30 >merged30.txt 2>merged30.tmp.txt + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at generating the mega mnd file. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } + rm temp.log + sort -k2,2 -k6,6 -S 6G --parallel=${threads} merged30.tmp.txt >> merged30.txt && rm merged30.tmp.txt + + ## opt2 + # awk -v extra=$extra '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log "doit {} 1 >out.{#}.txt 2>err.{#}.txt" + # exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + # [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } + # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite -k "cat out.{}.txt" >merged1.txt + # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite -k "sort -k6,6 -S6G err.{}.txt" >>merged1.txt + # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite "rm out.{}.txt err.{}.txt" + # rm temp.log + + touch inter.txt inter_hists.m + if [ ! -z ${shortcut_stats_1} ]; then + tmpstr="" + IFS=',' read -ra STATS <<< "$shortcut_stats_1" + for i in "${STATS[@]}"; do + [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } + done + [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter ${tmpstr:1} + fi + + if [[ $threadsHic -gt 1 ]] && [[ ! 
-s merged1_index.txt ]] + then + "${juicer_dir}"/scripts/index_by_chr.awk merged1.txt 500000 > merged1_index.txt + tempdirPre="HIC_tmp" && mkdir $tempdirPre + threadHicString="--threads $threadsHic -i merged1_index.txt -t ${tempdirPre}" + else + threadHicString="" + fi + + touch inter_30.txt inter_30_hists.m + if [ ! -z ${shortcut_stats_30} ]; then + tmpstr="" + IFS=',' read -ra STATS <<< "$shortcut_stats_30" + for i in "${STATS[@]}"; do + [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } + done + [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter_30 ${tmpstr:1} + fi + + if [[ $threadsHic -gt 1 ]] && [[ ! -s merged30_index.txt ]] + then + "${juicer_dir}"/scripts/index_by_chr.awk merged30.txt 500000 > merged30_index.txt + tempdirPre30="HIC30_tmp" && mkdir "${tempdirPre30}" + threadHic30String="--threads $threadsHic -i merged30_index.txt -t ${tempdirPre30}" + else + threadHic30String="" + fi + + ##@MUHAMMAD: + export IBM_JAVA_OPTIONS="-Xmx50000m -Xgcthreads1" + export _JAVA_OPTIONS="-Xmx50000m -Xms50000m" + ##@MUHAMMAD: + + "${juicer_dir}"/scripts/juicer_tools pre -n -s inter.txt -g inter_hists.m -q 1 "$resolutionsToBuildString" "$threadHicString" merged1.txt inter.hic "$chrom_sizes" + "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter.hic + rm -Rf "${tempdirPre}" + ## TODO: check for failure + + "${juicer_dir}"/scripts/juicer_tools pre -n -s inter_30.txt -g inter_30_hists.m -q 30 "$resolutionsToBuildString" "$threadHic30String" merged30.txt inter_30.hic "$chrom_sizes" + "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_30.hic + rm -Rf "${tempdirPre30}" + ## TODO: check for failure + + echo ":) Done building mega hic files." >&1 + + [ "$last_stage" == "hic" ] && { echo "Done with the requested workflow. 
Exiting after generating the mega.hic file!"; exit; } + first_stage="hicnarrow" +fi + +## II. HICNARROW. +if [ "$first_stage" == "hicnarrow" ]; then + + echo "...Annotating loops and domains..." >&1 + + [ -f inter_30.hic ] || { echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr; exit 1; } + + [ -z $genome_id ] && { echo ":( This step requires a genome ID. Please provide (e.g. \"-g hg38\") or skip this step. Exiting!" | tee -a /dev/stderr; exit 1; } + + # Create loop lists file for MQ > 30 + "${juicer_dir}"/scripts/juicer_hiccups.sh -j "${juicer_dir}"/scripts/juicer_tools -i inter_30.hic -m "${juicer_dir}"/references/motif -g "$genome_id" + + ##TODO: check for failure + + "${juicer_dir}"/scripts/juicer_arrowhead.sh -j "${juicer_dir}"/scripts/juicer_tools -i inter_30.hic + + ##TODO: check for failure + + echo ":) Done annotating loops and domains." >&1 + + [ "$last_stage" == "hicnarrow" ] && { echo "Done with the requested workflow. Exiting after generating loop and domain annotations!"; exit; } + first_stage="dhs" + +fi + +## III. BUILD HAPLOID ACCESSIBILITY TRACKS +if [ "$first_stage" == "dhs" ]; then + + echo "...Building accessibility tracks..." >&1 + + ([ -f reads.sorted.bam ] && [ -f reads.sorted.bam.bai ]) || { echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr; exit 1; } + + ## figure out platform + pl=`samtools view -H reads.sorted.bam | grep '^@RG' | sed "s/.*PL:\([^\t]*\).*/\1/g" | sed "s/ILM/ILLUMINA/g;s/Illumina/ILLUMINA/g;s/LS454/454/g" | uniq` + ([ "$pl" == "ILLUMINA" ] || [ "$pl" == "454" ]) || { echo ":( Platform name is not recognized or data from different platforms seems to be mixed. Can't handle this case. Exiting!" 
| tee -a /dev/stderr && exit 1; } + [ "$pl" == "ILLUMINA" ] && junction_rt_string="-d rt:2 -d rt:3 -d rt:4 -d rt:5" || junction_rt_string="-d rt:0 -d rt:1" + + export SHELL=$(type -p bash) + export junction_rt_string=${junction_rt_string} + doit () { + mapq=$2 + samtools view -@ 2 ${junction_rt_string} -q $mapq -h reads.sorted.bam $1 | awk 'BEGIN{OFS="\t"}{for (i=12; i<=NF; i++) {if ($i ~ /^ip/) {split($i, ip, ":"); locus[ip[3]]++; break}}}END{for (i in locus) {print $3, i-1, i, locus[i]}}' | sort -k2,2n -S6G + } + export -f doit + + # mapq1 accessibility track + awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 1 > tmp.bedgraph + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } + rm temp.log + bedGraphToBigWig tmp.bedgraph $chrom_sizes inter.bw + rm tmp.bedgraph + + # mapq30 accessibility track + awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 30 > tmp.bedgraph + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } + rm temp.log + bedGraphToBigWig tmp.bedgraph $chrom_sizes inter_30.bw + rm tmp.bedgraph + + echo ":) Done building accessibility tracks." >&1 + + [ "$last_stage" == "dhs" ] && { echo "Done with the requested workflow. Exiting after building haploid accessibility tracks!"; exit; } + ([ -z $vcf ] && [ -z $psf ] && [ -z ${reads_to_homologs} ]) && first_stage="cleanup" || first_stage="diploid_hic" + +fi + +## IV. BUILD DIPLOID CONTACT MAPS +if [ "$first_stage" == "diploid_hic" ]; then + + echo "...Building diploid contact maps from reads overlapping phased SNPs..." >&1 + + if [ ! -s reads.sorted.bam ] || [ ! 
-s reads.sorted.bam.bai ] ; then + echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr + exit 1 + fi + ## @PAUL: which chrom.sizes file is used? Maybe it's fine..? + + chr=`awk -v exclude_chr=${exclude_chr} 'BEGIN{split(exclude_chr,a,"|"); for(i in a){ignore[a[i]]=1}}!($1 in ignore){str=str"|"$1}END{print substr(str,2)}' ${chrom_sizes}` + + if [ -z $reads_to_homologs ]; then + + if [ -z $psf ]; then + echo " ... Parsing vcf..." + awk -v chr=${chr} -v output_prefix="out" -f ${phaser_dir}/phase/vcf-to-psf-and-assembly.awk ${vcf} + echo " ... :) Done parsing vcf!" + psf=out.psf + rm out.assembly + fi + + echo " ... Extracting reads overlapping SNPs..." + export SHELL=$(type -p bash) + export psf=${psf} + export pipeline=${phaser_dir} + doit () { + samtools view -@ 2 reads.sorted.bam $1 | awk -f ${pipeline}/phase/extract-SNP-reads-from-sam-file.awk ${psf} - + } + export -f doit + echo $chr | tr "|" "\n" | parallel -j $threads --will-cite --joblog temp.log doit > dangling.sam + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at parsing bam. Check stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } + rm temp.log + + bash ${phaser_dir}/phase/assign-reads-to-homologs.sh -t ${threads} -c ${chr} $psf dangling.sam + reads_to_homologs=reads_to_homologs.txt + echo " ... :) Done extracting reads overlapping SNPs!" + + fi + + # build mnd file: can do without sort -n, repeat what was done in hic stage. 
+ export SHELL=$(type -p bash) + export psf=${psf} + export pipeline=${phaser_dir} + export reads_to_homologs=$reads_to_homologs + doit () { + samtools view -@ 2 -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{OFS="\t"}FILENAME==ARGV[1]{if($2==chr"-r"||$2==chr"-a"){if(keep[$1]&&keep[$1]!=$2){delete keep[$1]}else{keep[$1]=$2}};next}$0~/^@SQ/{$2=$2"-r"; print; $2=substr($2,1,length($2)-2)"-a";print;next}$0~/^@/{print;next}($1 in keep)&&($7=="="||$7=="*"){$3=keep[$1];print}' $reads_to_homologs - | samtools sort -n -m 1G -O sam | awk '$0~/^@/{next}($1!=prev){if(n==2){sub("\t","",str); print str}; str=""; n=0}{for(i=12;i<=NF;i++){if($i~/^ip:i:/){$4=substr($i,6);break;}};str=str"\t"n"\t"$3"\t"$4"\t"n; n++; prev=$1}END{if(n==2){sub("\t","",str); print str}}' | sort -k 2,2 -S 6G + + } + export -f doit + echo $chr | tr "|" "\n" | parallel -j $threads --will-cite --joblog temp.log -k doit > diploid.mnd.txt + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! 
" | tee -a /dev/stderr && exit 1; } + rm temp.log + + + ## TODO: talk with Muhammad about adding VC norm + + # build hic file(s) + if [ "$separate_homologs" == "true" ]; then + { awk '$2~/-r$/{gsub("-r","",$2); gsub("-r","",$6); print}' diploid.mnd.txt > tmp1.mnd.txt && "${juicer_dir}"/scripts/juicer_tools pre -n "$resolutionsToBuildString" tmp1.mnd.txt "diploid_inter_r.hic" ${chrom_sizes}; "${juicer_dir}"/scripts/juicer_tools addNorm diploid_inter_r.hic -k VC,VC_SQRT; } #& + { awk '$2~/-a$/{gsub("-a","",$2); gsub("-a","",$6); print}' diploid.mnd.txt > tmp2.mnd.txt && "${juicer_dir}"/scripts/juicer_tools pre -n "$resolutionsToBuildString" tmp2.mnd.txt "diploid_inter_a.hic" ${chrom_sizes}; "${juicer_dir}"/scripts/juicer_tools addNorm diploid_inter_a.hic -k VC,VC_SQRT; } #& + #wait + rm tmp1.mnd.txt tmp2.mnd.txt + ## TODO: check if successful + else + "${juicer_dir}"/scripts/juicer_tools pre -n "$resolutionsToBuildString" diploid.mnd.txt "diploid_inter.hic" <(awk 'BEGIN{OFS="\t"}{print $1"-r", $2; print $1"-a", $2}' ${chrom_sizes}) + "${juicer_dir}"/scripts/juicer_tools addNorm diploid_inter.hic -k VC,VC_SQRT + ## TODO: check if successful + fi + + rm -f diploid.mnd.txt out.psf dangling.sam + + echo ":) Done building diploid contact maps from reads overlapping phased SNPs." >&1 + + [ "$last_stage" == "diploid_hic" ] && { echo "Done with the requested workflow. Exiting after building diploid contact maps!"; exit; } + first_stage="diploid_dhs" + +fi + +## V. BUILD DIPLOID ACCESSIBILITY TRACKS +if [ "$first_stage" == "diploid_dhs" ]; then + + echo "...Building diploid accessibility tracks from reads overlapping phased SNPs..." >&1 + + if [ ! -s reads.sorted.bam ] || [ ! -s reads.sorted.bam.bai ] || [ -z $reads_to_homologs ] || [ ! -s $reads_to_homologs ]; then + echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" 
| tee -a /dev/stderr + exit 1 + fi + + ## figure out platform + pl=`samtools view -H reads.sorted.bam | grep '^@RG' | sed "s/.*PL:\([^\t]*\).*/\1/g" | sed "s/ILM/ILLUMINA/g;s/Illumina/ILLUMINA/g;s/LS454/454/g" | uniq` + ([ "$pl" == "ILLUMINA" ] || [ "$pl" == "454" ]) || { echo ":( Platform name is not recognized or data from different platforms seems to be mixed. Can't handle this case. Exiting!" | tee -a /dev/stderr && exit 1; } + [ "$pl" == "ILLUMINA" ] && junction_rt_string="-d rt:2 -d rt:3 -d rt:4 -d rt:5" || junction_rt_string="-d rt:0 -d rt:1" + +## @SUHAS + export SHELL=$(type -p bash) + export junction_rt_string=${junction_rt_string} + export reads_to_homologs=${reads_to_homologs} + doit () { +samtools view -@2 ${junction_rt_string} -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{ + OFS="\t"}FILENAME==ARGV[1]{ + if($2==chr"-r"||$2==chr"-a"){ + if(keep[$1]&&keep[$1]!=$2){ + delete keep[$1] + }else{ + keep[$1]=$2; + if ($3%2==0){ + keepRT[$1 " " $3+1]++; + } else{ + keepRT[$1 " " $3-1]++; + } + } + }; + next + } + $0~/^@/{next} + ($1 in keep){ + $3=keep[$1]; + for (i=12; i<=NF; i++) { + if ($i~/^ip/) { + split($i, ip, ":"); + } + else if ($i ~ /^rt:/) { + split($i, rt, ":"); + } + } + raw_locus[$3" "ip[3]]++ + if (keepRT[$1" "rt[3]]!="") { + locus[$3" "ip[3]]++; + } + }END{ + for (i in raw_locus) { + split(i, a, " ") + print a[1], a[2]-1, a[2], raw_locus[i] + } + for (i in locus) { + split(i, a, " "); + print a[1], a[2]-1, a[2], locus[i] > "/dev/stderr" + } + }' ${reads_to_homologs} - +} + + export -f doit + awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit >tmp_raw.bedgraph 2>tmp_corrected.bedgraph + + exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` + [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! 
" | tee -a /dev/stderr && exit 1; } + rm temp.log + + sort -k1,1 -k2,2n -S6G --parallel=${treads} tmp_raw.bedgraph > tmp_raw.bedgraph.sorted && mv tmp_raw.bedgraph.sorted tmp_raw.bedgraph + sort -k1,1 -k2,2n -S6G --parallel=${treads} tmp_corrected.bedgraph > tmp_corrected.bedgraph.sorted && mv tmp_corrected.bedgraph.sorted tmp_corrected.bedgraph + + # build bw file(s) + if [ "$separate_homologs" == "true" ]; then + awk 'BEGIN{OFS="\t"}$1~/-r$/{$1=substr($1,1,length($1)-2); print}' tmp_raw.bedgraph > tmp1.bedgraph + bedGraphToBigWig tmp1.bedgraph ${chrom_sizes} diploid_inter_raw_r.bw && rm tmp1.bedgraph + awk 'BEGIN{OFS="\t"}$1~/-a$/{$1=substr($1,1,length($1)-2); print}' tmp_raw.bedgraph > tmp2.bedgraph + bedGraphToBigWig tmp2.bedgraph ${chrom_sizes} diploid_inter_raw_a.bw && rm tmp2.bedgraph + + awk 'BEGIN{OFS="\t"}$1~/-r$/{$1=substr($1,1,length($1)-2); print}' tmp_corrected.bedgraph > tmp1.bedgraph + bedGraphToBigWig tmp1.bedgraph ${chrom_sizes} diploid_inter_corrected_r.bw && rm tmp1.bedgraph + awk 'BEGIN{OFS="\t"}$1~/-a$/{$1=substr($1,1,length($1)-2); print}' tmp_corrected.bedgraph > tmp2.bedgraph + bedGraphToBigWig tmp2.bedgraph ${chrom_sizes} diploid_inter_corrected_a.bw && rm tmp2.bedgraph + ## TODO: check if successful + else + bedGraphToBigWig tmp_raw.bedgraph <(awk 'BEGIN{OFS="\t"}{print $1"-r", $2; print $1"-a", $2}' ${chrom_sizes}) diploid_inter_raw.bw + bedGraphToBigWig tmp_corrected.bedgraph <(awk 'BEGIN{OFS="\t"}{print $1"-r", $2; print $1"-a", $2}' ${chrom_sizes}) diploid_inter_corrected.bw + fi + + #rm tmp_raw.bedgraph tmp_corrected.bedgraph + + echo ":) Done building diploid accessibility tracks from reads overlapping phased SNPs." >&1 + + [ "$last_stage" == "diploid_dhs" ] && { echo "Done with the requested workflow. Exiting after building diploid accessibility tracks!"; exit; } + first_stage="cleanup" + +fi + +# ## IX. CLEANUP +# echo "...Starting cleanup..." 
>&1 +# #rm reads.sorted.bam reads.sorted.bam.bai +# #rm reads_to_homologs.txt +# echo ":) Done with cleanup. This is the last stage of the pipeline. Exiting!" +# exit + +} From 7122b9cde728054b173370cb55bcf1eb4a46a6e3 Mon Sep 17 00:00:00 2001 From: dudcha Date: Thu, 20 Jan 2022 14:59:20 -0600 Subject: [PATCH 06/10] First path for megafy script --- CPU/megafy.sh | 476 ++++++++++---------------------------------------- 1 file changed, 94 insertions(+), 382 deletions(-) diff --git a/CPU/megafy.sh b/CPU/megafy.sh index 73b93b9..58526e4 100755 --- a/CPU/megafy.sh +++ b/CPU/megafy.sh @@ -1,12 +1,11 @@ #!/bin/bash { -#### Description: Wrapper script to calculate a "mega" hic file from a set of bams, with optional diploid portion. -#### Usage: bash ./mega.sh [-v|--vcf ] ... . +#### Description: Wrapper script to calculate a "mega" hic file and accessibility track from a set of bams. +#### Usage: bash ./megafy.sh -c|--chrom-sizes ... . #### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments. -#### Output: "mega" hic map and "mega" chromatin accessibility bw file. -#### Optional input: phased vcf file. When passed launches generation of diploid versions of output. -#### Dependencies: Java, Samtools, GNU Parallel, KentUtils, Juicer, 3D-DNA (for diploid portion only). -#### Written by: +#### Output: "mega" hic map, "mega" chromatin accessibility bw file. +#### Dependencies: Java, Samtools, GNU Parallel, KentUtils, Juicer. +#### Written by: OD echo "*****************************************************" >&1 echo "cmd log: "$0" "$* >&1 @@ -14,12 +13,12 @@ echo "*****************************************************" >&1 USAGE=" ***************************************************** -Simplified mega script for ENCODE DCC hic-pipeline. +Optimized mega script for ENCODE DCC hic-pipeline. 
-USAGE: ./mega.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf ] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf ] [--reads-to-homologs ] [--juicer-dir ] [--phaser-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... +USAGE: ./megafy.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-q|--mapq mapq] [-r|--resolutions resolutions_string] [--juicer-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... DESCRIPTION: -This is a simplied mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks. +This is an optimized mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. ARGUMENTS: path_to_merged_dedup_bam @@ -29,38 +28,25 @@ OPTIONS: -h|--help Shows this help. -HAPLOID PORTION --g|--genome-id [genome_id] - Genome id, e.g. hg38, for some of the common references used by Juicer. Used to run the motif finder. +ACTUAL WORK: + +-q|--mapq [mapq] + Mapping quality threshold to be used. Deafult: 30. -c|--chrom-sizes [path_to_chrom_sizes_file] Path to chrom.sizes file for the reference used when processing the individual Hi-C experiments. +-g|--genome-id [genome_id] + Genome id, e.g. hg38, for some of the common references used by Juicer. Used to run the motif finder. + -r|--resolutions [string] Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10. 
---shortcut-stats-1 [inter.hic_1,...inter.hic_N] - Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=1. - ---shortcut-stats-30 [inter_30.hic_1,...inter_30.hic_N] - Comma-separated list of hic files to use to quickly calculate the mega stats file, at mapq=30. - -DIPLOID PORTION: --v|--vcf [path_to_vcf] - Path to a Variant Call Format (vcf) file containing phased sequence variation data, e.g. as generated by the ENCODE DCC Hi-C variant calling & phasing pipeline. Passing a vcf file invokes a diploid section of the script. - --C|--exclude-chr-from-diploid [chr_list] - Remove specific molecules from the chromosome list (default: chrY). Note that -c and -C are incompatible in that if both are invoked, only the last listed option will be taken into account. Deafult: chrY. - ---separate-homologs - Build two separate contact maps & two separate accessibility tracks for homologs as opposed to a single diploid contact map with interleaved homologous chromosomes. This is the preferred mode for comparing contacts across homologs using the \"Observed over Control\" view for map comparison in Juicebox. Importantly, assignment of chromosomes to homologs is arbitrary. Default: not invoked. --p|--psf [path_to_psf] - Path to a 3D-DNA phaser psf file containing heterozygous variant and phasing information. Optional input to fast-forward some steps & save compute on processing the vcf file. - ---reads-to-homologs [path_to_reads_to_homologs_file] - Path to a reads_to_homologs file generated by the phasing pipeline. Optional input to fast-forward some steps & save compute on diploid processing (assumes the input bams were used for phasing). +--shortcut-stats [inter_30.hic_1,...inter_30.hic_N] + Comma-separated list of hic files to use to quickly calculate the mega stats file. Make sure the files used were generated at the same mapping quality as expected of the mega-map (default: -q 30). 
WORKFLOW CONTROL: + -t|--threads [num] Indicate how many threads to use. Default: half of available cores as calculated by parallel --number-of-cores. @@ -70,24 +56,19 @@ WORKFLOW CONTROL: --juicer-dir [path_to_juicer_dir] Path to Juicer directory, contains scripts/, references/, and restriction_sites/ ---phaser-dir [path_to_3ddna_dir] - Path to 3D-DNA directory, contains phase/ - --from-stage [pipeline_stage] - Fast-forward to a particular stage of the pipeline. The pipeline_stage argument can be \"prep\", \"hic\", \"hicnarrow\", \"dhs\", \"diploid_hic\", \"diploid_dhs\", \"cleanup\". + Fast-forward to a particular stage of the pipeline. The pipeline_stage argument can be \"prep\", \"hic\", \"annotations\", \"dhs\", \"cleanup\". --to-stage [pipeline_stage] - Exit after a particular stage of the pipeline. The argument can be \"prep\", \"hic\", \"hicnarrow\", \"dhs\", \"diploid_hic\", \"diploid_dhs\", \"cleanup\". + Exit after a particular stage of the pipeline. The argument can be \"prep\", \"hic\", \"annotations\", \"dhs\", \"cleanup\". 
***************************************************** " # defaults: -resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10" +mapq=30 +resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10,1" genome_id="hg38" -exclude_chr="Y|chrY|MT|chrM" -separate_homologs=false -mapq=1 ##lowest mapq of interest # multithreading threads=`parallel --number-of-cores` @@ -112,11 +93,9 @@ last_stage="cleanup" declare -A stage stage[prep]=0 stage[hic]=1 -stage[hicnarrow]=2 +stage[annotations]=2 stage[dhs]=3 -stage[diploid_hic]=4 -stage[diploid_dhs]=5 -stage[cleanup]=6 +stage[cleanup]=4 ############### HANDLE OPTIONS ############### @@ -126,11 +105,7 @@ while :; do echo "$USAGE" >&1 exit 0 ;; -## HAPLOID PORTION - -g|--genome-id) OPTARG=$2 - genome_id=$OPTARG - shift - ;; +## OUTPUT -c|--chrom-sizes) OPTARG=$2 if [ -s $OPTARG ] && [[ $OPTARG == *.chrom.sizes ]]; then echo "... -c|--chrom-sizes flag was triggered with $OPTARG value." >&1 @@ -141,58 +116,28 @@ while :; do fi shift ;; - -r|--resolutions) OPTARG=$2 - resolutionsToBuildString="-r "$OPTARG - shift - ;; - --shortcut-stats-1) OPTARG=$2 - echo "... --shortcut-stats-1 flag was triggered with $OPTARG value." >&1 - shortcut_stats_1=$OPTARG - shift - ;; - --shortcut-stats-30) OPTARG=$2 - echo "... --shortcut-stats-30 flag was triggered with $OPTARG value." >&1 - shortcut_stats_30=$OPTARG + -g|--genome-id) OPTARG=$2 + genome_id=$OPTARG shift ;; -## DIPLOID PORTION - -v|--vcf) OPTARG=$2 - if [ -s $OPTARG ] && [[ $OPTARG == *.vcf ]]; then - echo "... -v|--vcf flag was triggered, will try to generate diploid versions of the hic file and accessibility track based on phasing data in $OPTARG." >&1 - vcf=$OPTARG - else - echo " :( Vcf file is not found at expected location, is empty or does not have the expected extension. Exiting!" 
>&2 + -q|--mapq) OPTARG=$2 + re='^[0-9]+$' + if [[ $OPTARG =~ $re ]]; then + echo "... -q|--mapq flag was triggered, will use $OPTARG as mapping quality threshold." >&1 + mapq=$OPTARG + else + echo " :( Wrong syntax for mapping quality threshold parameter value. Exiting!" >&2 exit 1 - fi - shift - ;; - -C|--exclude-chr-from-diploid) OPTARG=$2 - echo "... -C|--exclude-chr-from-diploid flag was triggered, will ignore variants on $OPTARG." >&1 - exclude_chr=$OPTARG + fi shift - ;; - --separate-homologs) - echo "... --separate-homologs flag was triggered, will build two separate contact maps and two separate accessibility tracks (_r.hic/_a.hic and _r.bw/_a.bw) for chromosomal homologs with identical chromosomal labels." >&1 - separate_homologs=true - ;; - -p|--psf) OPTARG=$2 - if [ -s $OPTARG ] && [[ $OPTARG == *.psf ]]; then - echo "... -p|--psf flag was triggered, will try to generate diploid versions of the hic file and accessibility track based on phasing data in $OPTARG." >&1 - psf=$OPTARG - else - echo " :( Psf file is not found at expected location, is empty or does not have the expected extension. Exiting!" >&2 - exit 1 - fi + ;; + -r|--resolutions) OPTARG=$2 + resolutionsToBuildString="-r "$OPTARG shift ;; - --reads-to-homologs) OPTARG=$2 - if [ -s $OPTARG ] && [[ $OPTARG == *.txt ]]; then - echo "... --reads-to-homologs flag was triggered, will try to generate diploid versions of the hic file and accessibility track based on reads-to-homolog data in $OPTARG." >&1 - reads_to_homologs=$OPTARG - else - echo " :( File is not found at expected location, is empty or does not have the expected extension. Exiting!" >&2 - exit 1 - fi + --shortcut-stats) OPTARG=$2 + echo "... --shortcut-stats flag was triggered with $OPTARG value." >&1 + shortcut_stats=$OPTARG shift ;; ## WORKFLOW @@ -227,33 +172,23 @@ while :; do echo " :( Juicer folder not found at expected location. Exiting!" >&2 fi shift - ;; - --phaser-dir) OPTARG=$2 - if [ -d $OPTARG ]; then - echo "... 
--phaser-dir flag was triggered with $OPTARG." >&1 - phaser_dir=$OPTARG - else - exit 1 - echo " :( Juicer folder not found at expected location. Exiting!" >&2 - fi - shift ;; --from-stage) OPTARG=$2 - if [ "$OPTARG" == "prep" ] || [ "$OPTARG" == "hic" ] || [ "$OPTARG" == "hicnarrow" ] || [ "$OPTARG" == "dhs" ] || [ "$OPTARG" == "diploid_hic" ] || [ "$OPTARG" == "diploid_dhs" ] || [ "$OPTARG" == "cleanup" ]; then + if [ "$OPTARG" == "prep" ] || [ "$OPTARG" == "hic" ] || [ "$OPTARG" == "annotations" ] || [ "$OPTARG" == "dhs" ] || [ "$OPTARG" == "cleanup" ]; then echo "... --from-stage flag was triggered. Will fast-forward to $OPTARG." >&1 first_stage=$OPTARG else - echo " :( Whong syntax for pipeline stage. Please use prep/hic/hicnarrow/dhs/diploid_hic/diploid_dhs/cleanup. Exiting!" >&2 + echo " :( Whong syntax for pipeline stage. Please use prep/hic/annotations/dhs/cleanup. Exiting!" >&2 exit 1 fi shift ;; --to-stage) OPTARG=$2 - if [ "$OPTARG" == "prep" ] || [ "$OPTARG" == "hic" ] || [ "$OPTARG" == "hicnarrow" ] || [ "$OPTARG" == "dhs" ] || [ "$OPTARG" == "diploid_hic" ] || [ "$OPTARG" == "diploid_dhs" ] || [ "$OPTARG" == "cleanup" ]; then + if [ "$OPTARG" == "prep" ] || [ "$OPTARG" == "hic" ] || [ "$OPTARG" == "annotations" ] || [ "$OPTARG" == "dhs" ] || [ "$OPTARG" == "cleanup" ]; then echo "... --to-stage flag was triggered. Will exit after $OPTARG." >&1 last_stage=$OPTARG else - echo " :( Whong syntax for pipeline stage. Please use prep/hic/hicnarrow/dhs/diploid_hic/diploid_dhs/cleanup. Exiting!" >&2 + echo " :( Whong syntax for pipeline stage. Please use prep/hic/annotations/dhs/cleanup. Exiting!" >&2 exit 1 fi shift @@ -288,7 +223,6 @@ fi ## Juicer & Phaser [ -z $juicer_dir ] && { echo >&2 ":( Juicer directory is not specified. Exiting!"; exit 1; } -([ ! -z $vcf ] && [ -z $phaser_dir ]) && { echo >&2 ":( Phaser directory is not specified. 
Exiting!"; exit 1; } ## Java Dependency type java >/dev/null 2>&1 || { echo >&2 ":( Java is not available, please install/add to path Java. Exiting!"; exit 1; } @@ -311,7 +245,7 @@ bam=`echo "${@:1}"` ##TODO: check file extentions ############### MAIN ################# -## 0. PREP BAM FILE +## 0. PREP BAM FILE: {bam}->reads.sorted.bam & reads.sorted.bam.bai if [ "$first_stage" == "prep" ]; then @@ -338,7 +272,10 @@ if [ "$first_stage" == "prep" ]; then fi -## I. HIC +## I. HIC: +### (a) reads.sorted.bam & reads.sorted.bam.bai -> merged${mapq}.txt +### (b) $shortcut_stats -> inter_${mapq}.txt & inter_${mapq}_hists.m +### (c) merged${mapq}.txt, inter_${mapq}.txt & inter_${mapq}_hists.m -> inter_$mapq.hic if [ "$first_stage" == "hic" ]; then @@ -346,6 +283,8 @@ if [ "$first_stage" == "hic" ]; then ([ -f reads.sorted.bam ] && [ -f reads.sorted.bam.bai ]) || { echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr; exit 1; } +### (a) Generate mnd-type input for pre: reads.sorted.bam & reads.sorted.bam.bai -> "merged"${mapq}".txt" + export SHELL=$(type -p bash) doit () { mapq=$2 @@ -354,119 +293,82 @@ if [ "$first_stage" == "hic" ]; then export -f doit - ## opt1 - ## extra variable contains all (small) sequences that are not already in the chrom.sizes file. There is no use in them for generating the hic file, they are generated only for consistensy (and potentially stats) + ### extra variable contains all (small) sequences that are not already in the chrom.sizes file. 
There is no use in them for generating the hic file, they are generated only for consistensy (and potentially stats) extra=`samtools view -H reads.sorted.bam | grep '^@SQ' | sed "s/.*SN:\([^\t]*\).*/\1/g" | awk 'FILENAME==ARGV[1]{drop[$1]=1;next}!($1 in drop)' ${chrom_sizes} - | xargs` - #merged1.txt - awk -v extra="$extra" '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 1 >merged1.txt 2>merged1.tmp.txt + awk -v extra="$extra" '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} $mapq >"merged"${mapq}".txt" 2>"merged"${mapq}".tmp.txt" exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` [ $exitval -eq 0 ] || { echo ":( Pipeline failed at generating the mega mnd file. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } rm temp.log - sort -k2,2 -k6,6 -S 6G --parallel=${threads} merged1.tmp.txt >> merged1.txt && rm merged1.tmp.txt + sort -k2,2 -k6,6 -S 6G --parallel=${threads} "merged"${mapq}".tmp.txt" >> "merged"${mapq}".txt" && rm "merged"${mapq}".tmp.txt" - #merged30.txt - ## @MUHAMMAD: why do we have two now rather than running pre from the same one? - awk -v extra="$extra" '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 30 >merged30.txt 2>merged30.tmp.txt - exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` - [ $exitval -eq 0 ] || { echo ":( Pipeline failed at generating the mega mnd file. See stderr for more info. Exiting! 
" | tee -a /dev/stderr && exit 1; } - rm temp.log - sort -k2,2 -k6,6 -S 6G --parallel=${threads} merged30.tmp.txt >> merged30.txt && rm merged30.tmp.txt - - ## opt2 - # awk -v extra=$extra '{print $1}END{print extra}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log "doit {} 1 >out.{#}.txt 2>err.{#}.txt" - # exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` - # [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } - # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite -k "cat out.{}.txt" >merged1.txt - # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite -k "sort -k6,6 -S6G err.{}.txt" >>merged1.txt - # awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite "rm out.{}.txt err.{}.txt" - # rm temp.log - - touch inter.txt inter_hists.m - if [ ! -z ${shortcut_stats_1} ]; then - tmpstr="" - IFS=',' read -ra STATS <<< "$shortcut_stats_1" - for i in "${STATS[@]}"; do - [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } - done - [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter ${tmpstr:1} - fi - - if [[ $threadsHic -gt 1 ]] && [[ ! -s merged1_index.txt ]] - then - "${juicer_dir}"/scripts/index_by_chr.awk merged1.txt 500000 > merged1_index.txt - tempdirPre="HIC_tmp" && mkdir $tempdirPre - threadHicString="--threads $threadsHic -i merged1_index.txt -t ${tempdirPre}" - else - threadHicString="" - fi +### (b) recreate stats: $shortcut_stats -> inter_${mapq}.txt & inter_${mapq}_hists.m - touch inter_30.txt inter_30_hists.m - if [ ! -z ${shortcut_stats_30} ]; then + touch "inter_"${mapq}".txt" "inter_"${mapq}"_hists.m" + if [ ! 
-z ${shortcut_stats} ]; then tmpstr="" - IFS=',' read -ra STATS <<< "$shortcut_stats_30" + IFS=',' read -ra STATS <<< "$shortcut_stats" for i in "${STATS[@]}"; do [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } done - [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter_30 ${tmpstr:1} + [ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar "inter_"${mapq} ${tmpstr:1} fi - if [[ $threadsHic -gt 1 ]] && [[ ! -s merged30_index.txt ]] +### (c) Run pre: merged${mapq}.txt, inter_${mapq}.txt & inter_${mapq}_hists.m -> inter_$mapq.hic + + ## prep + if [[ $threadsHic -gt 1 ]] then - "${juicer_dir}"/scripts/index_by_chr.awk merged30.txt 500000 > merged30_index.txt - tempdirPre30="HIC30_tmp" && mkdir "${tempdirPre30}" - threadHic30String="--threads $threadsHic -i merged30_index.txt -t ${tempdirPre30}" + ## Same indexing parameter 500000? + [[ ! 
-s "merged"${mapq}"_index.txt" ]] && "${juicer_dir}"/scripts/index_by_chr.awk "merged"${mapq}".txt" 500000 > "merged"${mapq}"_index.txt" + tempdirPre="HIC"${mapq}"_tmp" && mkdir "${tempdirPre}" + threadHicString="--threads $threadsHic -i merged${mapq}_index.txt -t ${tempdirPre}" else - threadHic30String="" + threadHicString="" fi - ##@MUHAMMAD: - export IBM_JAVA_OPTIONS="-Xmx50000m -Xgcthreads1" + ## resources TBD for actual mega-jobs: + export IBM_JAVA_OPTIONS="-Xmx50000m -Xgcthreads24" export _JAVA_OPTIONS="-Xmx50000m -Xms50000m" - ##@MUHAMMAD: - "${juicer_dir}"/scripts/juicer_tools pre -n -s inter.txt -g inter_hists.m -q 1 "$resolutionsToBuildString" "$threadHicString" merged1.txt inter.hic "$chrom_sizes" - "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter.hic - rm -Rf "${tempdirPre}" - ## TODO: check for failure - - "${juicer_dir}"/scripts/juicer_tools pre -n -s inter_30.txt -g inter_30_hists.m -q 30 "$resolutionsToBuildString" "$threadHic30String" merged30.txt inter_30.hic "$chrom_sizes" - "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_30.hic - rm -Rf "${tempdirPre30}" + ## actually run pre. Passing -q does not do anything, for record. + "${juicer_dir}"/scripts/juicer_tools pre -n -s inter_${mapq}.txt -g inter_${mapq}_hists.m -q $mapq "$resolutionsToBuildString" "$threadHicString" merged${mapq}.txt inter_${mapq}.hic "$chrom_sizes" + "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_${mapq}.hic + rm -Rf "${tempdirPre}" ## TODO: check for failure echo ":) Done building mega hic files." >&1 [ "$last_stage" == "hic" ] && { echo "Done with the requested workflow. Exiting after generating the mega.hic file!"; exit; } - first_stage="hicnarrow" + first_stage="annotations" fi -## II. HICNARROW. -if [ "$first_stage" == "hicnarrow" ]; then +## II. ANNOTATION PACKAGE. 
ONLY ADDING HICCUPS AND ARROWHEAD AS EXAMPLE: inter${mapq}.hic -> annotations +if [ "$first_stage" == "annotations" ]; then echo "...Annotating loops and domains..." >&1 - [ -f inter_30.hic ] || { echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr; exit 1; } + [ -f inter_${mapq}.hic ] || { echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr; exit 1; } [ -z $genome_id ] && { echo ":( This step requires a genome ID. Please provide (e.g. \"-g hg38\") or skip this step. Exiting!" | tee -a /dev/stderr; exit 1; } - # Create loop lists file for MQ > 30 - "${juicer_dir}"/scripts/juicer_hiccups.sh -j "${juicer_dir}"/scripts/juicer_tools -i inter_30.hic -m "${juicer_dir}"/references/motif -g "$genome_id" + # Create loop lists file + "${juicer_dir}"/scripts/juicer_hiccups.sh -j "${juicer_dir}"/scripts/juicer_tools -i inter_${mapq}.hic -m "${juicer_dir}"/references/motif -g "$genome_id" ##TODO: check for failure - "${juicer_dir}"/scripts/juicer_arrowhead.sh -j "${juicer_dir}"/scripts/juicer_tools -i inter_30.hic + "${juicer_dir}"/scripts/juicer_arrowhead.sh -j "${juicer_dir}"/scripts/juicer_tools -i inter_${mapq}.hic ##TODO: check for failure echo ":) Done annotating loops and domains." >&1 - [ "$last_stage" == "hicnarrow" ] && { echo "Done with the requested workflow. Exiting after generating loop and domain annotations!"; exit; } + [ "$last_stage" == "annotations" ] && { echo "Done with the requested workflow. Exiting after generating loop and domain annotations!"; exit; } first_stage="dhs" fi -## III. BUILD HAPLOID ACCESSIBILITY TRACKS +## III. BUILD ACCESSIBILITY TRACKS: reads.sorted.bam & reads.sorted.bam.bai -> inter_${mapq}.bw if [ "$first_stage" == "dhs" ]; then echo "...Building accessibility tracks..." 
>&1 @@ -486,215 +388,25 @@ if [ "$first_stage" == "dhs" ]; then } export -f doit - # mapq1 accessibility track - awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 1 > tmp.bedgraph - exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` - [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } - rm temp.log - bedGraphToBigWig tmp.bedgraph $chrom_sizes inter.bw - rm tmp.bedgraph - # mapq30 accessibility track - awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} 30 > tmp.bedgraph + awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit {} $mapq > tmp.bedgraph exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } rm temp.log - bedGraphToBigWig tmp.bedgraph $chrom_sizes inter_30.bw + bedGraphToBigWig tmp.bedgraph $chrom_sizes inter_${mapq}.bw rm tmp.bedgraph echo ":) Done building accessibility tracks." >&1 - [ "$last_stage" == "dhs" ] && { echo "Done with the requested workflow. Exiting after building haploid accessibility tracks!"; exit; } - ([ -z $vcf ] && [ -z $psf ] && [ -z ${reads_to_homologs} ]) && first_stage="cleanup" || first_stage="diploid_hic" - -fi - -## IV. BUILD DIPLOID CONTACT MAPS -if [ "$first_stage" == "diploid_hic" ]; then - - echo "...Building diploid contact maps from reads overlapping phased SNPs..." >&1 - - if [ ! -s reads.sorted.bam ] || [ ! -s reads.sorted.bam.bai ] ; then - echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" | tee -a /dev/stderr - exit 1 - fi - ## @PAUL: which chrom.sizes file is used? Maybe it's fine..? 
- - chr=`awk -v exclude_chr=${exclude_chr} 'BEGIN{split(exclude_chr,a,"|"); for(i in a){ignore[a[i]]=1}}!($1 in ignore){str=str"|"$1}END{print substr(str,2)}' ${chrom_sizes}` - - if [ -z $reads_to_homologs ]; then - - if [ -z $psf ]; then - echo " ... Parsing vcf..." - awk -v chr=${chr} -v output_prefix="out" -f ${phaser_dir}/phase/vcf-to-psf-and-assembly.awk ${vcf} - echo " ... :) Done parsing vcf!" - psf=out.psf - rm out.assembly - fi - - echo " ... Extracting reads overlapping SNPs..." - export SHELL=$(type -p bash) - export psf=${psf} - export pipeline=${phaser_dir} - doit () { - samtools view -@ 2 reads.sorted.bam $1 | awk -f ${pipeline}/phase/extract-SNP-reads-from-sam-file.awk ${psf} - - } - export -f doit - echo $chr | tr "|" "\n" | parallel -j $threads --will-cite --joblog temp.log doit > dangling.sam - exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` - [ $exitval -eq 0 ] || { echo ":( Pipeline failed at parsing bam. Check stderr for more info. Exiting! " | tee -a /dev/stderr && exit 1; } - rm temp.log - - bash ${phaser_dir}/phase/assign-reads-to-homologs.sh -t ${threads} -c ${chr} $psf dangling.sam - reads_to_homologs=reads_to_homologs.txt - echo " ... :) Done extracting reads overlapping SNPs!" - - fi - - # build mnd file: can do without sort -n, repeat what was done in hic stage. 
- export SHELL=$(type -p bash) - export psf=${psf} - export pipeline=${phaser_dir} - export reads_to_homologs=$reads_to_homologs - doit () { - samtools view -@ 2 -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{OFS="\t"}FILENAME==ARGV[1]{if($2==chr"-r"||$2==chr"-a"){if(keep[$1]&&keep[$1]!=$2){delete keep[$1]}else{keep[$1]=$2}};next}$0~/^@SQ/{$2=$2"-r"; print; $2=substr($2,1,length($2)-2)"-a";print;next}$0~/^@/{print;next}($1 in keep)&&($7=="="||$7=="*"){$3=keep[$1];print}' $reads_to_homologs - | samtools sort -n -m 1G -O sam | awk '$0~/^@/{next}($1!=prev){if(n==2){sub("\t","",str); print str}; str=""; n=0}{for(i=12;i<=NF;i++){if($i~/^ip:i:/){$4=substr($i,6);break;}};str=str"\t"n"\t"$3"\t"$4"\t"n; n++; prev=$1}END{if(n==2){sub("\t","",str); print str}}' | sort -k 2,2 -S 6G - - } - export -f doit - echo $chr | tr "|" "\n" | parallel -j $threads --will-cite --joblog temp.log -k doit > diploid.mnd.txt - exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` - [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! 
" | tee -a /dev/stderr && exit 1; } - rm temp.log - - - ## TODO: talk with Muhammad about adding VC norm - - # build hic file(s) - if [ "$separate_homologs" == "true" ]; then - { awk '$2~/-r$/{gsub("-r","",$2); gsub("-r","",$6); print}' diploid.mnd.txt > tmp1.mnd.txt && "${juicer_dir}"/scripts/juicer_tools pre -n "$resolutionsToBuildString" tmp1.mnd.txt "diploid_inter_r.hic" ${chrom_sizes}; "${juicer_dir}"/scripts/juicer_tools addNorm diploid_inter_r.hic -k VC,VC_SQRT; } #& - { awk '$2~/-a$/{gsub("-a","",$2); gsub("-a","",$6); print}' diploid.mnd.txt > tmp2.mnd.txt && "${juicer_dir}"/scripts/juicer_tools pre -n "$resolutionsToBuildString" tmp2.mnd.txt "diploid_inter_a.hic" ${chrom_sizes}; "${juicer_dir}"/scripts/juicer_tools addNorm diploid_inter_a.hic -k VC,VC_SQRT; } #& - #wait - rm tmp1.mnd.txt tmp2.mnd.txt - ## TODO: check if successful - else - "${juicer_dir}"/scripts/juicer_tools pre -n "$resolutionsToBuildString" diploid.mnd.txt "diploid_inter.hic" <(awk 'BEGIN{OFS="\t"}{print $1"-r", $2; print $1"-a", $2}' ${chrom_sizes}) - "${juicer_dir}"/scripts/juicer_tools addNorm diploid_inter.hic -k VC,VC_SQRT - ## TODO: check if successful - fi - - rm -f diploid.mnd.txt out.psf dangling.sam - - echo ":) Done building diploid contact maps from reads overlapping phased SNPs." >&1 - - [ "$last_stage" == "diploid_hic" ] && { echo "Done with the requested workflow. Exiting after building diploid contact maps!"; exit; } - first_stage="diploid_dhs" - -fi - -## V. BUILD DIPLOID ACCESSIBILITY TRACKS -if [ "$first_stage" == "diploid_dhs" ]; then - - echo "...Building diploid accessibility tracks from reads overlapping phased SNPs..." >&1 - - if [ ! -s reads.sorted.bam ] || [ ! -s reads.sorted.bam.bai ] || [ -z $reads_to_homologs ] || [ ! -s $reads_to_homologs ]; then - echo ":( Files from previous stages of the pipeline appear to be missing. Exiting!" 
| tee -a /dev/stderr - exit 1 - fi - - ## figure out platform - pl=`samtools view -H reads.sorted.bam | grep '^@RG' | sed "s/.*PL:\([^\t]*\).*/\1/g" | sed "s/ILM/ILLUMINA/g;s/Illumina/ILLUMINA/g;s/LS454/454/g" | uniq` - ([ "$pl" == "ILLUMINA" ] || [ "$pl" == "454" ]) || { echo ":( Platform name is not recognized or data from different platforms seems to be mixed. Can't handle this case. Exiting!" | tee -a /dev/stderr && exit 1; } - [ "$pl" == "ILLUMINA" ] && junction_rt_string="-d rt:2 -d rt:3 -d rt:4 -d rt:5" || junction_rt_string="-d rt:0 -d rt:1" - -## @SUHAS - export SHELL=$(type -p bash) - export junction_rt_string=${junction_rt_string} - export reads_to_homologs=${reads_to_homologs} - doit () { -samtools view -@2 ${junction_rt_string} -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{ - OFS="\t"}FILENAME==ARGV[1]{ - if($2==chr"-r"||$2==chr"-a"){ - if(keep[$1]&&keep[$1]!=$2){ - delete keep[$1] - }else{ - keep[$1]=$2; - if ($3%2==0){ - keepRT[$1 " " $3+1]++; - } else{ - keepRT[$1 " " $3-1]++; - } - } - }; - next - } - $0~/^@/{next} - ($1 in keep){ - $3=keep[$1]; - for (i=12; i<=NF; i++) { - if ($i~/^ip/) { - split($i, ip, ":"); - } - else if ($i ~ /^rt:/) { - split($i, rt, ":"); - } - } - raw_locus[$3" "ip[3]]++ - if (keepRT[$1" "rt[3]]!="") { - locus[$3" "ip[3]]++; - } - }END{ - for (i in raw_locus) { - split(i, a, " ") - print a[1], a[2]-1, a[2], raw_locus[i] - } - for (i in locus) { - split(i, a, " "); - print a[1], a[2]-1, a[2], locus[i] > "/dev/stderr" - } - }' ${reads_to_homologs} - -} - - export -f doit - awk '{print $1}' $chrom_sizes | parallel -j $threads --will-cite --joblog temp.log -k doit >tmp_raw.bedgraph 2>tmp_corrected.bedgraph - - exitval=`awk 'NR>1{if($7!=0){c=1; exit}}END{print c+0}' temp.log` - [ $exitval -eq 0 ] || { echo ":( Pipeline failed at building diploid contact maps. See stderr for more info. Exiting! 
" | tee -a /dev/stderr && exit 1; } - rm temp.log - - sort -k1,1 -k2,2n -S6G --parallel=${treads} tmp_raw.bedgraph > tmp_raw.bedgraph.sorted && mv tmp_raw.bedgraph.sorted tmp_raw.bedgraph - sort -k1,1 -k2,2n -S6G --parallel=${treads} tmp_corrected.bedgraph > tmp_corrected.bedgraph.sorted && mv tmp_corrected.bedgraph.sorted tmp_corrected.bedgraph - - # build bw file(s) - if [ "$separate_homologs" == "true" ]; then - awk 'BEGIN{OFS="\t"}$1~/-r$/{$1=substr($1,1,length($1)-2); print}' tmp_raw.bedgraph > tmp1.bedgraph - bedGraphToBigWig tmp1.bedgraph ${chrom_sizes} diploid_inter_raw_r.bw && rm tmp1.bedgraph - awk 'BEGIN{OFS="\t"}$1~/-a$/{$1=substr($1,1,length($1)-2); print}' tmp_raw.bedgraph > tmp2.bedgraph - bedGraphToBigWig tmp2.bedgraph ${chrom_sizes} diploid_inter_raw_a.bw && rm tmp2.bedgraph - - awk 'BEGIN{OFS="\t"}$1~/-r$/{$1=substr($1,1,length($1)-2); print}' tmp_corrected.bedgraph > tmp1.bedgraph - bedGraphToBigWig tmp1.bedgraph ${chrom_sizes} diploid_inter_corrected_r.bw && rm tmp1.bedgraph - awk 'BEGIN{OFS="\t"}$1~/-a$/{$1=substr($1,1,length($1)-2); print}' tmp_corrected.bedgraph > tmp2.bedgraph - bedGraphToBigWig tmp2.bedgraph ${chrom_sizes} diploid_inter_corrected_a.bw && rm tmp2.bedgraph - ## TODO: check if successful - else - bedGraphToBigWig tmp_raw.bedgraph <(awk 'BEGIN{OFS="\t"}{print $1"-r", $2; print $1"-a", $2}' ${chrom_sizes}) diploid_inter_raw.bw - bedGraphToBigWig tmp_corrected.bedgraph <(awk 'BEGIN{OFS="\t"}{print $1"-r", $2; print $1"-a", $2}' ${chrom_sizes}) diploid_inter_corrected.bw - fi - - #rm tmp_raw.bedgraph tmp_corrected.bedgraph - - echo ":) Done building diploid accessibility tracks from reads overlapping phased SNPs." >&1 - - [ "$last_stage" == "diploid_dhs" ] && { echo "Done with the requested workflow. Exiting after building diploid accessibility tracks!"; exit; } - first_stage="cleanup" - + [ "$last_stage" == "dhs" ] && { echo "Done with the requested workflow. 
Exiting after building accessibility tracks!"; exit; } + first_stage="cleanup" fi -# ## IX. CLEANUP -# echo "...Starting cleanup..." >&1 -# #rm reads.sorted.bam reads.sorted.bam.bai -# #rm reads_to_homologs.txt -# echo ":) Done with cleanup. This is the last stage of the pipeline. Exiting!" -# exit +## IX. CLEANUP + echo "...Starting cleanup..." >&1 + #rm reads.sorted.bam reads.sorted.bam.bai + #rm merged${mapq}.txt + echo ":) Done with cleanup. This is the last stage of the pipeline. Exiting!" + exit } From 6633ab84fc7dce3d68a44a31b2aab50f86eb8018 Mon Sep 17 00:00:00 2001 From: dudcha Date: Thu, 20 Jan 2022 15:01:33 -0600 Subject: [PATCH 07/10] description tweak --- CPU/megafy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CPU/megafy.sh b/CPU/megafy.sh index 58526e4..d6a7206 100755 --- a/CPU/megafy.sh +++ b/CPU/megafy.sh @@ -3,7 +3,7 @@ #### Description: Wrapper script to calculate a "mega" hic file and accessibility track from a set of bams. #### Usage: bash ./megafy.sh -c|--chrom-sizes ... . #### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments. -#### Output: "mega" hic map, "mega" chromatin accessibility bw file. +#### Output: "mega" hic map, "mega" chromatin accessibility bw file, "mega" annotations [latter set to be expanded]. #### Dependencies: Java, Samtools, GNU Parallel, KentUtils, Juicer. #### Written by: OD From 204feb3ae72e6204c24382310829a6e695735f24 Mon Sep 17 00:00:00 2001 From: dudcha Date: Thu, 20 Jan 2022 15:13:44 -0600 Subject: [PATCH 08/10] rename option name for stats inferrence --- CPU/megafy.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/CPU/megafy.sh b/CPU/megafy.sh index d6a7206..7ff38ed 100755 --- a/CPU/megafy.sh +++ b/CPU/megafy.sh @@ -15,7 +15,7 @@ USAGE=" ***************************************************** Optimized mega script for ENCODE DCC hic-pipeline. 
-USAGE: ./megafy.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-q|--mapq mapq] [-r|--resolutions resolutions_string] [--juicer-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... +USAGE: ./megafy.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-q|--mapq mapq] [-r|--resolutions resolutions_string] [--juicer-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [-i|--infer-stats inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... DESCRIPTION: This is an optimized mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. @@ -42,7 +42,7 @@ ACTUAL WORK: -r|--resolutions [string] Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10. ---shortcut-stats [inter_30.hic_1,...inter_30.hic_N] +-i|--infer-stats [inter_30.hic_1,...inter_30.hic_N] Comma-separated list of hic files to use to quickly calculate the mega stats file. Make sure the files used were generated at the same mapping quality as expected of the mega-map (default: -q 30). WORKFLOW CONTROL: @@ -135,9 +135,9 @@ while :; do resolutionsToBuildString="-r "$OPTARG shift ;; - --shortcut-stats) OPTARG=$2 - echo "... --shortcut-stats flag was triggered with $OPTARG value." >&1 - shortcut_stats=$OPTARG + -i|--infer-stats) OPTARG=$2 + echo "... -i|--infer-stats flag was triggered with $OPTARG value." >&1 + infer_stats_from_files_str=$OPTARG shift ;; ## WORKFLOW @@ -274,7 +274,7 @@ fi ## I. 
HIC: ### (a) reads.sorted.bam & reads.sorted.bam.bai -> merged${mapq}.txt -### (b) $shortcut_stats -> inter_${mapq}.txt & inter_${mapq}_hists.m +### (b) $infer_stats_from_files_str -> inter_${mapq}.txt & inter_${mapq}_hists.m ### (c) merged${mapq}.txt, inter_${mapq}.txt & inter_${mapq}_hists.m -> inter_$mapq.hic if [ "$first_stage" == "hic" ]; then @@ -302,12 +302,12 @@ if [ "$first_stage" == "hic" ]; then rm temp.log sort -k2,2 -k6,6 -S 6G --parallel=${threads} "merged"${mapq}".tmp.txt" >> "merged"${mapq}".txt" && rm "merged"${mapq}".tmp.txt" -### (b) recreate stats: $shortcut_stats -> inter_${mapq}.txt & inter_${mapq}_hists.m +### (b) recreate stats: $infer_stats_from_files_str -> inter_${mapq}.txt & inter_${mapq}_hists.m touch "inter_"${mapq}".txt" "inter_"${mapq}"_hists.m" - if [ ! -z ${shortcut_stats} ]; then + if [ ! -z ${infer_stats_from_files_str} ]; then tmpstr="" - IFS=',' read -ra STATS <<< "$shortcut_stats" + IFS=',' read -ra STATS <<< "$infer_stats_from_files_str" for i in "${STATS[@]}"; do [ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; } done From a8fb7304fa1f2c4acc9428e33f1e65f90e5b2955 Mon Sep 17 00:00:00 2001 From: dudcha Date: Thu, 20 Jan 2022 16:54:37 -0600 Subject: [PATCH 09/10] cast mapq as int in bam to mnd --- CPU/megafy.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CPU/megafy.sh b/CPU/megafy.sh index 7ff38ed..f381850 100755 --- a/CPU/megafy.sh +++ b/CPU/megafy.sh @@ -15,7 +15,7 @@ USAGE=" ***************************************************** Optimized mega script for ENCODE DCC hic-pipeline. 
-USAGE: ./megafy.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-q|--mapq mapq] [-r|--resolutions resolutions_string] [--juicer-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [-i|--infer-stats inter_30.hic_1,...inter_30.hic_N] [--from-stage ] [--to-stage ] ... +USAGE: ./megafy.sh -c|--chrom-sizes --juicer-dir -i|--infer-stats [-g|--genome-id ] [-q|--mapq ] [-r|--resolutions ] [-t|--threads ] [-T|--threads-hic ] [--from-stage ] [--to-stage ] ... DESCRIPTION: This is an optimized mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. @@ -288,7 +288,7 @@ if [ "$first_stage" == "hic" ]; then export SHELL=$(type -p bash) doit () { mapq=$2 - samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mqmp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}' + samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)+0}else if ($i~/^mp:i:/){mp=substr($i,6)+0}else if ($i~/^MQ:i:/){mq=substr($i,6)+0}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mqmp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}' } export -f doit @@ -327,14 +327,14 @@ if [ "$first_stage" == "hic" ]; then threadHicString="" fi - ## resources TBD for actual mega-jobs: + ## resources TBD for actual mega-jobs: 
@Muhammad should 24 be $threadsHic or something? export IBM_JAVA_OPTIONS="-Xmx50000m -Xgcthreads24" export _JAVA_OPTIONS="-Xmx50000m -Xms50000m" ## actually run pre. Passing -q does not do anything, for record. "${juicer_dir}"/scripts/juicer_tools pre -n -s inter_${mapq}.txt -g inter_${mapq}_hists.m -q $mapq "$resolutionsToBuildString" "$threadHicString" merged${mapq}.txt inter_${mapq}.hic "$chrom_sizes" "${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_${mapq}.hic - rm -Rf "${tempdirPre}" + rm -Rf "${tempdirPre}" "inter_${mapq}.hic_cat_outputs.sh" ## TODO: check for failure @@ -405,6 +405,7 @@ fi ## IX. CLEANUP echo "...Starting cleanup..." >&1 #rm reads.sorted.bam reads.sorted.bam.bai + #rm inter_${mapq}.txt inter_${mapq}_hists.m #rm merged${mapq}.txt echo ":) Done with cleanup. This is the last stage of the pipeline. Exiting!" exit From 6d76736d9a72e923a2ebc22b18961e230b24266f Mon Sep 17 00:00:00 2001 From: dudcha Date: Fri, 21 Jan 2022 10:04:40 -0600 Subject: [PATCH 10/10] remove 1bp resolution from defaults --- CPU/megafy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CPU/megafy.sh b/CPU/megafy.sh index f381850..847590f 100755 --- a/CPU/megafy.sh +++ b/CPU/megafy.sh @@ -67,7 +67,7 @@ WORKFLOW CONTROL: # defaults: mapq=30 -resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10,1" +resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10" genome_id="hg38" # multithreading