forked from xgrau/recoded-mixture-models
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecoding_alignments.R
More file actions
92 lines (81 loc) · 2.53 KB
/
recoding_alignments.R
File metadata and controls
92 lines (81 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# libraries
library("Biostrings")
#args = commandArgs(trailingOnly=TRUE)
args = c("Dataset.fasta")
ali_fn = args[1]
# read alignment
ali = Biostrings::readAAMultipleAlignment(ali_fn)
# recoding dictionaries
recoding_dictionary = list(
"Hp" = list(
"0" = "ACFGILMVW",
"1" = "DEHKNPQRSTY"
),
"SR4" = list(
"A" = "AGNPST",
"C" = "CHWY",
"G" = "DEKQR",
"T" = "FILMV"
),
"Dayhoff6" = list(
"0" = "AGPST",
"1" = "DENQ",
"2" = "HKR",
"3" = "ILMV",
"4" = "FWY",
"5" = "C"
),
"Dayhoff9" = list(
"0" = "DEHNQ",
"1" = "ILMV",
"2" = "FY",
"3" = "AST",
"4" = "KR",
"5" = "G",
"6" = "P",
"7" = "C",
"8" = "W"
),
"SR6" = list(
"0" = "APST",
"1" = "DENG",
"2" = "QKR",
"3" = "MIVL",
"4" = "WC",
"5" = "FYH"
),
"KGB6" = list(
"0" = "AGPS",
"1" = "DENQHKRT",
"2" = "MIL",
"3" = "W",
"4" = "FY",
"5" = "CV"
)
)
# references:
# Hp: N. Lartillot, S. Blanquart, T, Lepage, PhyloBayes 3.3 a Bayesian software for phylogenetic reconstruction and molecular dating using mixture models. http://www.atgc-montpellier.fr/download/binaries/phylobayes/phylobayes3.3.pdf
# Dayhoff6: M.O. Dayhoff, R.M. Schwartz, B.C. Orcutt, A model of evolutionary change in proteins
# Dayhoff9: Alexandra M Hernandez, Joseph F Ryan, Six-State Amino Acid Recoding is not an Effective Strategy to Offset Compositional Heterogeneity and Saturation in Phylogenetic Analyses
# SR6: On reduced amino acid alphabets for phylogenetic inference, Mol. Biol. Evol., 24 (2007), pp. 2139-2150
# KGB6: A new criterion and method for amino acid classification J. Theor. Biol., 228 (2004), pp. 97-106
for (dic in names(recoding_dictionary)) {
# where does the output go?
out_fn = gsub(".fasta", sprintf(".rec%s.fasta", dic), ali_fn)
# apply recoding
ali_r = ali
for (i in 1:length(recoding_dictionary[[dic]])) {
n = names(recoding_dictionary[[dic]])[i]
message(sprintf("%s recoding, change to %s", dic, n))
for (a in strsplit(recoding_dictionary[[dic]][[i]], split = "")[[1]] ) {
ali_r = Biostrings::chartr(old = a, new = n, x = ali_r)
}
}
# encode ambiguous characters as gaps
ali_r = Biostrings::chartr(old = "X", new = "-", x = ali_r)
ali_r = Biostrings::chartr(old = "N", new = "-", x = ali_r)
# save as string set
ali_r = Biostrings::BStringSet(ali_r)
# save as fasta
Biostrings::writeXStringSet(ali_r, filepath = out_fn)
}