-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathpdb2fasta.py
More file actions
executable file
·322 lines (281 loc) · 11.7 KB
/
pdb2fasta.py
File metadata and controls
executable file
·322 lines (281 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
#!/usr/bin/env python
# 2016-04-27 Chengxin Zhang
docstring='''
pdb2fasta.py pdb.pdb > seq.fasta
convert PDB file pdb.pdb to sequence FASTA file seq.fasta
options:
-PERMISSIVE={MSE,ATOM,HETATM} how to treat nonstandatd amino acids
MSE - (default), only ATOM or MSE HETATM in first MODEL is converted
ATOM - only allow ATOM residues
HETATM- All all ATOM & HETATM residues with CA atom, including ligands
-allowX={true,true} whether to allow amino acid "X"
true - (default) allow any amino acid type, including X
false - only allow amino acid that can mapped to 20 standard amino
acids ACDEFGHIKLMNPQRSTVWY
-outfmt={PDB,COFACTOR} how to treat multichain PDB
PDB - convert multiple chains PDB to multiple sequence FASTA
COFACTOR - convert PDB to single sequence FASTA. If there are
multiple chains in pdb, they will be treated as one chain.
-SEQRES={false,true} whether to convert from "SEQRES" entries
false - (default) always convert from "ATOM" entries
true - convert from "SEQRES" entries if present,
otherwise convert from "ATOM" entries.
(For PDB format input only)
-mol={all,protein,rna,dna} which macromolecule type to use
all - use " CA " or " C3'" atoms
protein - use " CA " atoms
rna - use " C3'" atoms
dna - use " C3'" atoms, the same as rna
'''
import sys,os
import shutil
import textwrap
from fixMSE import code_with_modified_residues
import gzip,tarfile
import random
code_standard = {
'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
#'MSE':'M',
' DA':'a', ' DC':'c', ' DG':'g', ' DT':'t',
' A':'a', ' C':'c', ' G':'g', ' T':'u',
}
def pdbbundle2seq(tarball_name="pdb-bundle.tar.gz",PERMISSIVE="MSE",
outfmt="PDB",allowX=True,mol="all"):
'''convert best effort/minimum PDB bundle to sequence
'''
chain_id_mapping=dict()
tarball_prefix=os.path.basename(tarball_name).split('.')[0]
tar=tarfile.open(tarball_name,'r:gz')
names=tar.getnames()
PDBid=names[-1].split('-')[0]
# parse chain id mapping
fp=tar.extractfile(PDBid+"-chain-id-mapping.txt")
map_txt=fp.read()
fp.close()
names=[] # list of PDB files in tarball
for section in map_txt.split('\n'+PDBid+"-pdb-bundle"):
if not ':' in section:
continue
idx,section=section.split(".pdb:\n")
pdb_bundle_name=PDBid+"-pdb-bundle"+idx+".pdb"
names.append(pdb_bundle_name)
for line in section.splitlines():
New_chain_ID,Original_chain_ID=line.split()
key=pdb_bundle_name.split('.')[0]+':'+New_chain_ID
value=tarball_prefix+':'+Original_chain_ID
chain_id_mapping[key]=value
header_list=[]
sequence_list=[]
for pdb_bundle_name in names:
# parse text in *-pdb-bundle*.pdb
fp=tar.extractfile(pdb_bundle_name)
txt=fp.read()
fp.close()
header_list_tmp,sequence_list_tmp=pdbtxt2seq(
txt,pdb_bundle_name,PERMISSIVE,outfmt,allowX,False,mol)
if outfmt=="PDB":
header_list+=[chain_id_mapping[h] for h \
in header_list_tmp]
sequence_list+=sequence_list_tmp
if outfmt=="COFACTOR":
sequence=''.join([''.join(s.splitlines()) for s in sequence_list])
sequence=textwrap.fill(''.join(sequence),60)
header=tarball_prefix+'\t'+str(len(sequence))
header_list=[header]
sequence_list=[sequence]
return header_list,sequence_list
def pdb2seq(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB",
allowX=True, SEQRES=False,mol="all"):
'''Convert PDB to sequence.
Return two lists, one for headers and the other for sequence.
PERMISSIVE - whether allow non-standard residues
ATOM: Only allow ATOM residues
HETATM: Allow all ATOM & HETATM residues, even if they are ligands
MSE: (default) Disallow any non-standard amino acid apart from MSE
'''
if infile.endswith(".tar.gz"): # best effort/minimum PDB bundle
return pdbbundle2seq(infile,PERMISSIVE,outfmt,allowX,mol)
#elif infile.endswith(".cif") or infile.endswith(".cif.gz"): # PDBx/mmCIF
#return mmCIF2seq(infile,PERMISSIVE,outfmt,allowX)
elif infile.endswith(".gz"):
fp=gzip.open(infile,'rU')
else:
fp=open(infile,'rU')
txt=fp.read()
fp.close()
return pdbtxt2seq(txt,infile,PERMISSIVE,outfmt,allowX,SEQRES,mol)
def mmCIF2seq(infile="pdb.cif", PERMISSIVE="MSE",outfmt="PDB",
allowX=True):
''' Convert mmCIF/PDBx format file "infile" '''
if infile.endswith(".gz"):
fp=gzip.open(infile,'rU')
else:
fp=open(infile,'rU')
txt=fp.read()
fp.close()
aa3to1=code_with_modified_residues
chain_list=[]
chain_dict=dict() # Each chain will be one keu
for line in txt.splitlines():
if not line.startswith("ATOM") or line.startswith("HETATM"):
continue
line=line.split()
if len(line)<26:
continue
group_PDB,atom_num,type_symbol,label_atom_id,label_alt_id, \
label_comp_id,label_asym_id,label_entity_id,label_seq_id, \
pdbx_PDB_ins_code,Cartn_x,Cartn_y,Cartn_z,occupancy, \
B_iso_or_equiv,Cartn_x_esd,Cartn_y_esd,Cartn_z_esd, \
occupancy_esd,B_iso_or_equiv_esd,pdbx_formal_charge, \
auth_seq_id,auth_comp_id,auth_asym_id,auth_atom_id, \
pdbx_PDB_model_num=line
if int(pdbx_PDB_model_num)>1:
break # just parse the first model
if label_atom_id!="CA" or not label_alt_id in ['.','A']: # just CA
continue
if PERMISSIVE=="ATOM" and group_PDB!="ATOM":
continue
elif PERMISSIVE=="MSE" and group_PDB!="ATOM" and label_comp_id!="MSE":
continue
aa=aa3to1[label_comp_id] if label_comp_id in aa3to1 else 'X'
if not allowX and aa=='X':
continue
if not label_asym_id in chain_dict:
chain_dict[label_asym_id]=[]
chain_list.append(label_asym_id)
residue_tuple=(label_seq_id,aa)
if not residue_tuple in chain_dict[label_asym_id]:
chain_dict[label_asym_id].append(residue_tuple)
header_list=[]
sequence_list=[]
PDBID=os.path.basename(infile).split('.')[0]
for chain_id in chain_list:
res_num_list,sequence=zip(*chain_dict[chain_id])
header_list.append(PDBID+':'+chain_id)
sequence_list.append(''.join(sequence))
if outfmt=="COFACTOR":
if len(sequence_list)>1:
sys.stderr.write("WARNING! Multichain PDB %s\n"%infile)
sequence=''.join(sequence_list)
header=PDBID+'\t'+str(len(sequence))
sequence=textwrap.fill(''.join(sequence),60)
header_list=[header]
sequence_list=[sequence]
return header_list,sequence_list
def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB",
allowX=True,SEQRES=False,mol="all"):
'''Convert PDB text "txt" to sequence read from PDB file "infile"
Return two lists, one for headers and the other for sequence.
PERMISSIVE - whether allow non-standard residues
ATOM: Only allow ATOM residues
HETATM: Allow all ATOM & HETATM residues, even if they are ligands
MSE: (default) Disallow any non-standard amino acid apart from MSE
'''
txt=txt.split("\nENDMDL")[0] # Only the first model
if not "SEQRES" in txt:
SEQRES=False # use "ATOM" is "SEQRES" is absent
mol=mol.lower()
aa3to1=code_with_modified_residues
chain_list=[]
chain_dict=dict() # Each chain will be one key
for line in txt.splitlines():
line=line+' '*(80-len(line)) # Each line contains at least 80 char
if SEQRES:
if not line[:6]=="SEQRES":
continue
chain_id=line[11].replace(' ','_')
tmp_seq=[]
for residue in line[19:].split():
if len(residue)!=3:
continue # only convert amino acid
aa=aa3to1[residue] if residue in aa3to1 else 'X'
if allowX or aa!='X':
tmp_seq.append((len(tmp_seq),aa))
if tmp_seq:
if not chain_id in chain_dict:
chain_dict[chain_id]=[]
chain_list.append(chain_id)
chain_dict[chain_id]+=tmp_seq
continue
if (mol=="protein" and line[12:16]!=" CA ") or \
(mol in {"rna","dna"} and line[12:16]!=" C3'") or \
(mol=="all" and line[12:16]!=" CA " and line[12:16]!=" C3'"):
continue
if not line[16] in [' ','A']: # remove alternative location
continue
residue=line[17:20] # residue name
if PERMISSIVE == "ATOM" and line[0:6]!="ATOM ":
continue
elif PERMISSIVE == "HETATM" and not line[0:6] in ("ATOM ","HETATM"):
continue
elif PERMISSIVE == "MSE" and (not line[0:6]=="ATOM " and \
not (line[0:6]=="HETATM" and residue=="MSE")):
continue
# underscore for empty chain identifier
chain_id=line[21].replace(' ','_')
res_num=line[22:27] # residue sequence number
aa=aa3to1[residue] if residue in aa3to1 else 'X' # one letter AA name
if not allowX and aa=='X':
continue
residue_tuple=(res_num,aa)
if not chain_id in chain_dict:
chain_dict[chain_id]=[]
chain_list.append(chain_id)
if not residue_tuple in chain_dict[chain_id]:
chain_dict[chain_id].append(residue_tuple)
header_list=[]
sequence_list=[]
PDBID=os.path.basename(infile).split('.')[0]
#PDBID=os.path.basename(infile).upper().split('.')[0]
for chain_id in chain_list:
res_num_list,sequence=zip(*chain_dict[chain_id])
header_list.append(PDBID+':'+chain_id)
sequence_list.append(''.join(sequence))
if outfmt=="COFACTOR":
if len(sequence_list)>1:
sys.stderr.write("WARNING! Multichain PDB %s\n"%infile)
sequence=''.join(sequence_list)
header=PDBID+'\t'+str(len(sequence))
sequence=textwrap.fill(''.join(sequence),60)
header_list=[header]
sequence_list=[sequence]
return header_list,sequence_list
def pdb2fasta(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB",
allowX=True,SEQRES=False, mol="all"):
'''Convert PDB to FASTA'''
header_list,sequence_list=pdb2seq(infile,PERMISSIVE,outfmt,allowX,SEQRES,mol)
fasta_list=['>'+header_list[i]+'\n'+ \
sequence_list[i] for i in range(len(sequence_list))]
return '\n'.join(fasta_list)+'\n'
if __name__=="__main__":
PERMISSIVE="MSE"
outfmt="PDB"
allowX=True
SEQRES=False
mol="all"
argv=[]
for arg in sys.argv[1:]:
if arg.startswith("-outfmt="):
outfmt=arg[len("-outfmt="):].upper()
elif arg.startswith("-PERMISSIVE="):
PERMISSIVE=arg[len("-PERMISSIVE="):].upper()
elif arg.startswith("-allowX="):
allowX=(arg[len("-allowX="):].lower()=="true")
elif arg.startswith("-SEQRES="):
SEQRES=(arg[len("-SEQRES="):].lower()=="true")
elif arg.startswith("-mol="):
mol=arg[len("-mol="):].lower()
elif arg.startswith("-"):
sys.stderr.write("ERROR! Unknown argument %s\n"%arg)
exit()
else:
argv.append(arg)
if len(argv)<1:
sys.stderr.write(docstring)
for pdb in argv:
sys.stdout.write(pdb2fasta(
pdb, PERMISSIVE=PERMISSIVE, outfmt=outfmt, allowX=allowX,
SEQRES=SEQRES, mol=mol))