-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtransmembrane.py
More file actions
120 lines (110 loc) · 2.29 KB
/
transmembrane.py
File metadata and controls
120 lines (110 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
import gzip
import sys
# Write a program that predicts if a protein is trans-membrane
# Trans-membrane proteins have the following properties
# Signal peptide: https://en.wikipedia.org/wiki/Signal_peptide
# Hydrophobic regions(s): https://en.wikipedia.org/wiki/Transmembrane_protein
# No prolines (alpha helix)
# Hydrophobicity is measued via Kyte-Dolittle
# https://en.wikipedia.org/wiki/Hydrophilicity_plot
# For our purposes:
# Signal peptide is 8 aa long, KD > 2.5, first 30 aa
# Hydrophobic region is 11 aa long, KD > 2.0, after 30 aa
import gzip
import sys
def read_fasta(filename):
name = None
seqs = []
fp = None
if filename == '-':
fp = sys.stdin
elif filename.endswith('.gz'):
fp = gzip.open(filename, 'rt')
else:
fp = open(filename)
for line in fp.readlines():
line = line.rstrip()
if line.startswith('>'):
if len(seqs) > 0:
seq = ''.join(seqs)
yield(name, seq)
name = line[1:]
seqs = []
else:
name = line[1:]
else:
seqs.append(line)
yield(name, ''.join(seqs))
fp.close()
def cal_kd(seq):
kd=0
for nt in seq:
if nt== "I" : kd+=4.5
elif nt == "V" : kd+=4.2
elif nt == "L" : kd+=3.8
elif nt == "F" : kd+=2.8
elif nt == "C" : kd+=2.5
elif nt == "M" : kd+=1.9
elif nt == "A" : kd+=1.8
elif nt == "G" : kd+=-0.4
elif nt == "T" : kd+=-0.7
elif nt == "S" : kd+=-0.8
elif nt == "W" : kd+=-0.9
elif nt == "Y" : kd+=-1.3
elif nt == "P" : kd+=-1.6
elif nt == "H" : kd+=-3.2
elif nt == "E" : kd+=-3.5
elif nt == "Q" : kd+=-3.5
elif nt == "D" : kd+=-3.5
elif nt == "N" : kd+=-3.5
elif nt == "K" : kd+=-3.9
elif nt == "R" : kd+=-4.5
return kd/len(seq)
def hydro_tf(seq,nmer,mean_kd):
for i in range(0,len(seq)-nmer+1,1):
seq_mers = seq[i:i+nmer]
kd = cal_kd(seq_mers)
if kd > mean_kd:
return True
return False
filename = sys.argv[1]
for name, seq in read_fasta(filename):
if hydro_tf(seq[0:30],8,2.5) and hydro_tf(seq[30:len(seq)],11,2) is True:
print(name)
"""
python3 transmembrane.py proteins.fasta.gz
18w
Dtg
Krn
Lac
Mcr
PRY
Pxt
Pzl
QC
Ror
S1P
S2P
Spt
apn
bai
bdl
bou
bug
cue
drd
ft
grk
knk
ksh
m
nac
ort
rk
smo
thw
tsg
waw
zye
"""