-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsnp_pipeline.bash
More file actions
248 lines (219 loc) · 5.22 KB
/
snp_pipeline.bash
File metadata and controls
248 lines (219 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/bin/bash
# Place your gatech userame in the below export
export NAME="yuewenz"
get_input () {
# Function for doing your getopts
#
# Input: Getopts array
while getopts "a:b:r:eo:f:zvih" option
do
case $option in
a) reads1=$OPTARG;;
b) reads2=$OPTARG;;
r) ref=$OPTARG;;
e) realign=1
echo "preformace read realign file"
;;
o) output=$OPTARG;;
f) millsFile=$OPTARG;;
z) gunzip=1
echo "Output VCF file should be gunzipped (*.vcf.gz)"
;;
v) v=1
echo "Verbose mode"
;;
i) index=1
echo "Index your output BAM"
;;
h) echo "Usage information"
exit 0
;;
esac
done
# Replace
# this
# with
# getopts
# code
}
check_files () {
# Function for checking for presence of input files, reference genome,
# and the output VCF file
#
#reads1="./reads1.fq"
if [ -e "$reads1" ]
then
echo "True"
else
echo "reads1.fq is missing"
exit 1
fi
if [ -e "$reads2" ]
then
echo "True"
else
echo "reads2.fq is missing"
exit 1
fi
if [ -e "$ref" ]
then
echo "True"
else
echo "ref.fa is missing"
exit 1
fi
if [ -e "$millsFile" ]
then
echo "True"
else
echo "millsFile is missing"
exit 1
fi
if [ -e "$output" ]
then
read -r -p "The $output alreay exist, what should you want to do? Overwrite or non-overwrite" answer
if [[ "$answer" =~ Overwrite ]]
then
rm $output
else
echo "It is good to go"
fi
else
echo "There are no exist file or program"
fi
# Input: File locations (string)
# Output: True, if checks pass; False, if checks fail (bool)
# Replace
# this
# with
# code
}
prepare_temp () {
# Preparing your temporary directory
#
#
tmp="$1"
if [ -d "tmp" ]
then
echo "$tmp exist!"
else
mkdir tmp
fi
# Replace
# this
# with
# getopts
# code
}
mapping () {
# Function for the mapping step of the SNP-calling pipeline
#
# Input: File locations (string), Verbose flag (bool)
# Output: File locations (string)
if [[ $v -eq 1 ]]
then
echo "This is for mapping and gain the lane_sorted.bam file for follwing step."
fi
#ref="./ref.fa"
#reads1="./reads1.fq"
#reads2="./reads2.fq"
samtools faidx "$ref"
bwa index "$ref"
bwa mem -R '@RG\tID:foo\tSM:bar\tLB:library1' "$ref" "$reads1" "$reads2" > lane.sam
samtools fixmate -O bam lane.sam lane_fixmate.bam
samtools sort -O bam -o lane_sorted.bam -T ./tmp/lane_temp lane_fixmate.bam
# Replace
# this
# with
# code
}
improvement () {
# Function for improving the number of miscalls
#
# Input: File locations (string)
# Output: File locations (string)
if [[ $v -eq 1 ]]
then
echo "This is improvement. This is for reducing the number of miscalls of INDELs in your data it is helpful to realign your raw gapped alignment with the Broad’s GATK Realigner."
fi
java -jar ./lib/picard.jar CreateSequenceDictionary R=$ref O=ref.dict
#ref="./ref.fa"
#millsFiles="./Mills_and_1000G_gold_standard.indels.hg38.vcf"
if [[ "$index" -eq 1 ]]
then
samtools index lane_sorted.bam
fi
java -Xmx2g -jar ./lib/GenomeAnalysisTK.jar -T RealignerTargetCreator -R "$ref" -I lane_sorted.bam -o lane.intervals --known "$millsFile" &> ./output/yuewenz.log
if [[ "$realign" -eq 1 ]]
then
java -Xmx4g -jar ./lib/GenomeAnalysisTK.jar -T IndelRealigner -R "$ref" -I lane_sorted.bam -targetIntervals lane.intervals -known "$millsFile" -o lane_realigned.bam &>> ./output/yuewenz.log
if [[ "$index" -eq 1 ]]
then
samtools index lane_realigned.bam
fi
fi
#fi
# Replace
# this
# with
# code
}
call_variants () {
# Function to call variants
#
# Input: File locations (string)
# Ouput: None
#ref="./ref.fa"
#output="yuewenz.vcf.gz"
if [ -e "$realign" ]
then
bcftools mpileup -Ou -f "$ref" lane_realigned.bam | bcftools call -vmO z -o ./output/"$output".vcf.gz
else
bcftools mpileup -Ou -f "$ref" lane_sorted.bam | bcftools call -vmO z -o ./output/"$output".vcf.gz
fi
# Replace
# this
# with
#code
}
gun_zip () {
#output="yuewenz.vcf.gz"
if [[ "$gunzip" -eq 1 ]]
then
gunzip -c output/"$output".vcf.gz > ./output/yuewenz.vcf
else
output/"$output".vcf.gz
fi
# Replace
# this
# with
#cod
}
convert () {
if [[ "$gunzip" -eq 1 ]]
then
grep -o '^[^#]*' ./output/yuewenz.vcf | awk '{gsub(/^chr/,""); print}' | cut -f1-5 | awk -v FS="\t" -v OFS="\t" '{ print $1, $2, $2+(length($5)-length($4)), length($5)-length($4); }' | awk '{if ($4!= 0) print >"./output/indels.txt"; else print >"./output/snps.txt"}'
else
gunzip -c ./output/"$output".vcf.gz | grep -o '^[^#]*' | awk '{gsub(/^chr/,""); print}' | cut -f1-5 | awk -v FS="\t" -v OFS="\t" '{ print $1, $2, $2+(length($5)-length($4)), length($5)-length($4); }' | awk '{if ($4!= 0) print >"./output/indels.txt"; else print >"./output/snps.txt"}'
fi
}
main() {
# Function that defines the order in which functions will be called
# You will see this construct and convention in a lot of structured code.
# Add flow control as you see appropriate
get_input "$@"
check_files # Add arguments here
prepare_temp
mapping # Add arguments here
improvement # Add arguments here
call_variants # Add arguments here
gun_zip
convert
}
# Calling the main function
main "$@"
# DO NOT EDIT THE BELOW FUNCTION
bats_test (){
command -v bats
}
# DO NOT EDIT THE ABOVE FUNCTION