qtran4 · psharma-gif · Oct 14, 2022 · Oct 14, 2022 · Oct 14, 2022 · Oct 14, 2022
diff --git a/Yu_Zhu_Sharma.sh b/Yu_Zhu_Sharma.sh
@@ -0,0 +1,53 @@
+#usage: this script could be run in the bioinformaticsproject folder, bash Yu_Zhu_Sharma.sh 
+
+#make a directory for output file, and if script needs to be run again, run rm -r output before running this script
+mkdir output
+
+#put all mcrA in one file
+for file in ~/Private/Biocomputing2022/bioinformaticsProject/ref_sequences/m*.fasta
+do 
+cat $file >> ~/Private/Biocomputing2022/bioinformaticsProject/output/mcrAall.txt
+done
+
+
+#put all hsp in one file
+for file in ~/Private/Biocomputing2022/bioinformaticsProject/ref_sequences/h*.fasta
+do 
+cat $file >> ~/Private/Biocomputing2022/bioinformaticsProject/output/hspall.txt
+done
+
+#align the mcrA and hsp genes using the muscle tool
+~/Private/Biocomputing2022/tools/muscle -in ~/Private/Biocomputing2022/bioinformaticsProject/output/mcrAall.txt -out ~/Private/Biocomputing2022/bioinformaticsProject/output/alignedmcrA.txt
+~/Private/Biocomputing2022/tools/muscle -in ~/Private/Biocomputing2022/bioinformaticsProject/output/hspall.txt -out ~/Private/Biocomputing2022/bioinformaticsProject/output/alignedhsp.txt
+
+#use hmmbuild tool to build hmm profiles from a sequence alignment for both genes
+~/Private/Biocomputing2022/tools/hmmbuild ~/Private/Biocomputing2022/bioinformaticsProject/output/mcrA.hmm ~/Private/Biocomputing2022/bioinformaticsProject/output/alignedmcrA.txt
+~/Private/Biocomputing2022/tools/hmmbuild ~/Private/Biocomputing2022/bioinformaticsProject/output/hsp.hmm ~/Private/Biocomputing2022/bioinformaticsProject/output/alignedhsp.txt
+
+#hmm search each proteom for mcrA genes and hsp genes, use 0.1 as a threshhold for E value
+cd ~/Private/Biocomputing2022/bioinformaticsProject/proteomes
+for file in *.fasta
+do
+~/Private/Biocomputing2022/tools/hmmsearch -E "0.1" --tblout ~/Private/Biocomputing2022/bioinformaticsProject/output/$file.mcrAout ~/Private/Biocomputing2022/bioinformaticsProject/output/mcrA.hmm $file
+done
+
+for file in *.fasta
+do
+~/Private/Biocomputing2022/tools/hmmsearch -E "0.1" --tblout ~/Private/Biocomputing2022/bioinformaticsProject/output/$file.hspout ~/Private/Biocomputing2022/bioinformaticsProject/output/hsp.hmm $file
+done
+
+#Write output csv with counts of matches for mcrA genes and hsp70 genes
+echo "name,mcrA_match,hsp_match" > ~/Private/Biocomputing2022/bioinformaticsProject/Yu_Zhu_Sharma_final.csv
+cd ~/Private/Biocomputing2022/bioinformaticsProject/proteomes
+for file in *.fasta
+do 
+name=$(echo $file)
+mcrA_match=$(grep -v "#" ~/Private/Biocomputing2022/bioinformaticsProject/output/$file.mcrAout | wc -l)
+hsp_match=$(grep -v "#" ~/Private/Biocomputing2022/bioinformaticsProject/output/$file.hspout | wc -l)
+echo "$name,$mcrA_match,$hsp_match" >> ~/Private/Biocomputing2022/bioinformaticsProject/Yu_Zhu_Sharma_final.csv
+done
+
+sed -i 's/.fasta//g' ~/Private/Biocomputing2022/bioinformaticsProject/Yu_Zhu_Sharma_final.csv 
+
+#sort output csv to find out the most ideal proteomes
+cat ~/Private/Biocomputing2022/bioinformaticsProject/Yu_Zhu_Sharma_final.csv | sed 1d | sort -t, -k1,2nr -k2,3nr
diff --git a/Yu_Zhu_Sharma_final.csv b/Yu_Zhu_Sharma_final.csv
@@ -0,0 +1,51 @@
+name,mcrA_match,hsp_match
+proteome_01,0,4
+proteome_02,0,2
+proteome_03,1,3
+proteome_04,0,4
+proteome_05,1,2
+proteome_06,0,0
+proteome_07,1,2
+proteome_08,0,5
+proteome_09,0,1
+proteome_10,0,3
+proteome_11,0,6
+proteome_12,0,6
+proteome_13,0,3
+proteome_14,0,1
+proteome_15,1,1
+proteome_16,1,1
+proteome_17,0,1
+proteome_18,0,8
+proteome_19,2,1
+proteome_20,0,3
+proteome_21,0,4
+proteome_22,0,8
+proteome_23,2,2
+proteome_24,1,2
+proteome_25,0,5
+proteome_26,0,1
+proteome_27,0,1
+proteome_28,0,0
+proteome_29,1,0
+proteome_30,0,1
+proteome_31,0,7
+proteome_32,0,4
+proteome_33,0,0
+proteome_34,0,1
+proteome_35,0,1
+proteome_36,0,3
+proteome_37,0,1
+proteome_38,1,1
+proteome_39,1,1
+proteome_40,0,2
+proteome_41,0,1
+proteome_42,1,3
+proteome_43,0,3
+proteome_44,1,1
+proteome_45,1,3
+proteome_46,0,2
+proteome_47,0,1
+proteome_48,1,1
+proteome_49,0,3
+proteome_50,1,3
diff --git a/Yu_Zhu_Sharma_final_list.txt b/Yu_Zhu_Sharma_final_list.txt
@@ -0,0 +1,65 @@
+Group members: Zhuoran Yu (zyu3), Xianglu Zhu (xzhu7), Puja Sharma (psharma2)
+
+We chose the E value threshold of 0.1 based on our research on E value for protein. E value less than 0.1 is considered significant. 
+
+Here are the most ideal proteins which have both mcrA genes and hsp70 genes, ranked by the number of mcrA gene first, and then hsp70 genes
+
+proteome_23
+proteome_19
+proteome_03
+proteome_42
+proteome_45
+proteome_50
+proteome_05
+proteome_07
+proteome_24
+proteome_15
+proteome_16
+proteome_38
+proteome_39
+proteome_44
+proteome_48
+
+This proteome has 1 mcrA gene, so it could make methane, however, there is no hsp gene found, therefore, it has low pH resistance.
+
+proteome_29
+
+Here is a list of proteomes which has no mcrA gene, but has one or more hsp70 genes, ranked by the pH resistance. 
+
+proteome_18
+proteome_22
+proteome_31
+proteome_11
+proteome_12
+proteome_08
+proteome_25
+proteome_01
+proteome_04
+proteome_21
+proteome_32
+proteome_10
+proteome_13
+proteome_20
+proteome_36
+proteome_43
+proteome_49
+proteome_02
+proteome_40
+proteome_46
+proteome_09
+proteome_14
+proteome_17
+proteome_26
+proteome_27
+proteome_30
+proteome_34
+proteome_35
+proteome_37
+proteome_41
+proteome_47
+
+These 3 proteomes are not ideal because they have neither mcrA gene or hsp70 gene.
+
+proteome_06
+proteome_28
+proteome_33