-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAPE_Python_Code.py
More file actions
375 lines (310 loc) · 13.7 KB
/
Copy pathAPE_Python_Code.py
File metadata and controls
375 lines (310 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
"""
Created on Fri Nov 16 12:51:19 2018
@author: Nguyen Thi Ngoc Mai
Student Number: 1746303
"""
#------------- Import necessary package for programming -----------------#
#
from pyteomics import mgf, pepxml, mass #
import numpy as np #
import matplotlib.pyplot as plt #
import pandas as pd #
import os #
import function #
import pylab #
import math #
from statistics import mean , median #
#------------------------------------------------------------------------#
##############################################################################
# ------------- STEP 1: Reading file to get mass and its charge ------------ #
##############################################################################
path = 'E:\\Study\\Uhasselt - Material\\2nd year\\Protein Analysis\\Project\\3800\\'
ms1 = dict()
for root, dirs, files in os.walk("3800"):
for filename in files:
df = pd.read_csv(path+filename,sep = ' ', nrows = 1,skip_blank_lines=True, names = ['m/z','z'])
ms1[df['m/z'][0]] = df['z'][0]
##############################################################################
# --------- STEP 2: Reading file to get mass and intensity from MS1 -------- #
##############################################################################
name = list(ms1.keys())
dataset = dict()
for root, dirs, files in os.walk("3800"):
for i in range(len(files)):
#print(len(files))
df = pd.read_csv(path+files[i], sep = ' ', skiprows = 1,
skip_blank_lines=True, names = ['m/z','intensity'])
dataset[name[i]] = df
# Normalizing intensity of MS2
for i in name:
df = dataset[i]
df['i.n'] = [df['intensity'][j]/sum(df['intensity']) for j in range(len(df))]
#df['r_intens'] = [df['intensity'][j]/max(df['intensity']) for j in range(len(df))]
#df['n_intens'] = [df['intensity'][j]/mean(df['intensity']) for j in range(len(df))]
#df['m_intens'] = [df['intensity'][j]/median(df['intensity']) for j in range(len(df))]
#df['h'] = [df['intensity'][j]/df['m/z'][j] for j in range(len(df))]
#df['norm_i'] = [df['h'][j]/sum(df['h']) for j in range(len(df))]
# -- normalize intensity for observed data
##############################################################################
# -------- STEP 3: Creating database for protein, decoy and peptides --------#
##############################################################################
# ---------------- CREATE PROTEIN DATABASE FROM FASTA FILE ------------------#
path = 'E:\\Study\\Uhasselt - Material\\2nd year\\Protein Analysis\\Project\\'
p_protein = open(path+'studentP.fasta','r')
protein = p_protein.read().splitlines()
protein_lst = function.pro(protein)
protein_db = function.get_db(protein_lst)
# ----------------- CREATE DECOY DATABASE FROM FASTA FILE -------------------#
d_protein = open(path+'studentD.fasta','r')
decoy = d_protein.read().splitlines()
decoy_lst = function.pro(decoy)
decoy_db = function.get_decoydb(decoy_lst)
# ----------------- CREATE PEPTIDE DATABASE FROM FASTA FILE -----------------#
p_peptide = open(path+'studentP_peptides.fasta','r')
peptide = p_peptide.read().splitlines()
peptide_mass_db = function.get_mass_db(peptide)
d_peptide = open(path+'studentD_peptides.fasta','r')
decoy_p = d_peptide.read().splitlines()
decoy_mass_db = function.get_mass_db(decoy_p)
# --------------- CREATE DATABASE FOR PROTEIN AND ITS PEPTIDES----------------#
# 3rd party package to digest whole protein sequence to peptides
from pyteomics import parser
# merge protein and decoy database:
whole_db = dict()
for i in list(protein_db.keys()):
whole_db[i] = protein_db[i]
for i in list(decoy_db.keys()):
whole_db[i] = decoy_db[i]
protein_peptides = dict()
keys = list(whole_db.keys())
for i in keys:
protein_peptides[i] = list(parser.cleave(whole_db[i], parser.expasy_rules['trypsin'],0))
##############################################################################
# -------- STEP 4: Getting potential candidates based on MS1's mass -------- #
##############################################################################
def candidate_ms1(precursor_mass, tolerance):
'''
This function to select potential candidate from current database
of peptides digested from protein and decoy db
- precursor_mass is from MS1 as MH+
- tolerance value here is measured by ppm
- proton: 1.007282 dalton (will be deducted from precursor_mass)
'''
protein = []
decoy = []
mass = precursor_mass - 1.00728
peptide_mass_lst = list(peptide_mass_db.keys())
decoy_mass_lst = list(decoy_mass_db.keys())
for i in peptide_mass_lst:
if abs(i - mass)/i*10**6 <= tolerance: #<= i + tolerance:
protein.append(peptide_mass_db[i])
for j in decoy_mass_lst:
if abs(j - mass)/j*10**6 <= tolerance:
decoy.append(decoy_mass_db[j])
return protein, decoy
def matchPeptide2Protein(peptide):
key = list(whole_db.keys())
l = []
for i in key:
if peptide in protein_peptides[i]:
l.append(i)
return l
candidate=dict() # list of peptide sequences based on precursor mass
tolerance = 100
for i in name:
candidate[i]=candidate_ms1(i,tolerance)[0] + candidate_ms1(i,tolerance)[1]
ms1_candidate = {} # this is just to create dataframe for the output
for i in name:
ms1_candidate[i] = pd.DataFrame([[j, matchPeptide2Protein(j)] for j in candidate[i]], columns = ['peptide', 'access.no'])
#ms1_df[i] = pd.DataFrame(ms1_candidate[i],columns=['peptide', 'access.no'])
##############################################################################
# -------- STEP 5: Getting b,y ions from given sequence and its m/z -------- #
##############################################################################
def bIon_db(sequence,charge):
'''
Creat database for mass of b ions to compare with observed data
'''
b=function.bIon(sequence)
b_db = dict()
for j in range(1, charge):
for i in b:
b_db[float(mass.fast_mass(i,ion_type='b',charge=j))] = i# for j in range(1,charge)]
return b_db
def yIon_db(sequence,charge):
'''
Creat database for mass of y ions to compare with observed data
'''
y=function.yIon(sequence)
y_db = dict()
for j in range(1, charge):
for i in y:
y_db[float(mass.fast_mass(i,ion_type='y',charge=j))] = i# for j in range(1,charge)]
return y_db
def getIons(sequence,charge):
'''
This function return theoretical mass to charge of b and y ions
from MS2.
Based on this, comparing with observed data to list down
potential ions to predict peptide sequence
'''
outcome = []
bions = function.bIon(sequence)
yions = function.yIon(sequence)
for i in bions:
outcome.append(i)
for j in range(1,charge):
outcome.append(float(mass.fast_mass(i,ion_type='b',charge=j)))
for i in yions:
outcome.append(i)
for j in range(1,charge):
outcome.append(float(mass.fast_mass(i,ion_type='y',charge=j)))
return outcome
def getCIDFragmentIons(sequence,charge):
l = getIons(sequence, charge)
d = dict()
out = []
for i in range(int(len(l)/charge)):
out.append(l[i*charge:i*charge+charge])
for i in ('b', 'y'):
d['b'] = pd.DataFrame(out[:int(len(out)/2)])
d['y'] = pd.DataFrame(out[int(len(out)/2):])
#return d
for i in list(d.values()):
print(i)
##### Example for function getCIDFragmentIons:
getCIDFragmentIons('AMLKPWTS',3)
# Results are matrices for b and y ions and their m/z with z up to
# precursor charge - 1
# 0 1 2
#0 A 72.044386 36.525831
#1 AM 203.084876 102.046076
#2 AML 316.168936 158.588106
#3 AMLK 444.263896 222.635586
#4 AMLKP 541.316656 271.161966
#5 AMLKPW 727.395966 364.201621
#6 AMLKPWT 828.443646 414.725461
# 0 1 2
#0 STWPKLM 862.449131 431.728204
#1 STWPKL 731.408641 366.207959
#2 STWPK 618.324581 309.665929
#3 STWP 490.229621 245.618449
#4 STW 393.176861 197.092069
#5 ST 207.097551 104.052414
#6 S 106.049871 53.528574
def candidate_bIon(sequence, charge, ms1_mass, tolerance):
'''
This function to select potential b or y ions from potential candidate
from MS1.
It will return the list of b and y ions based on the mass in observed
data within the tolerance
Tolerance value is measured by da: this project applies 0.1da. User can
used by their own reference
'''
db = bIon_db(sequence,charge)
ion = []
#H = 1.00728
for i in dataset[ms1_mass]['m/z']:
for j in list(db.keys()):
if abs(i-j)*charge <= tolerance:
ion.append(db[j])
return ion
def candidate_yIon(sequence, charge, ms1_mass, tolerance):
'''
This function to select potential b or y ions from potential candidate
from MS1.
It will return the list of b and y ions based on the mass in observed
data within the tolerance
Tolerance value is measured by da: this project applies 0.1da. User can
used by their own reference
'''
db = yIon_db(sequence,charge)
ion = []
for i in dataset[ms1_mass]['m/z']:
for j in list(db.keys()):
if abs(i-j)*charge <= tolerance:
ion.append(db[j])
return ion
##############################################################################
# ------- STEP 7: Scoring potential peptides from MS1 by count peak ------- #
# --- by mass matching of ions and divided by number of theoretical ions --- #
##############################################################################
# Method 1: Counting number of peak matching
def score1(lst, precursor_mass):
d = dict()
for i in lst:
d[i] = len(candidate_bIon(i,ms1[precursor_mass],precursor_mass,0.1))+len(candidate_yIon(i,ms1[precursor_mass],precursor_mass,0.1))
return d
# Method 2: Taking intensity and number of peak matching
# -- Getting the interval and average intensity within interval for observed data
def intensity_bIon(sequence, charge, ms1_mass, tolerance):
'''
This function to select potential b or y ions from potential candidate
from MS1.
It will return the list of b and y ions based on the mass in observed
data within the tolerance
Tolerance value is measured by da: this project applies 0.1da. User can
used by their own reference
'''
db = bIon_db(sequence,charge)
ion = []
s = 0
df = dataset[ms1_mass]
for i in range(len(df['m/z'])):
for j in list(db.keys()):
if abs(df['m/z'][i]-j)*charge <= tolerance:
ion.append([db[j],df['m/z'][i]])
s += df['i.n'][i]
return s
def intensity_yIon(sequence, charge, ms1_mass, tolerance):
'''
This function to select potential b or y ions from potential candidate
from MS1.
It will return the list of b and y ions based on the mass in observed
data within the tolerance
Tolerance value is measured by da: this project applies 0.1da. User can
used by their own reference
'''
db = yIon_db(sequence,charge)
ion = []
s = 0
df = dataset[ms1_mass]
for i in range(len(df['m/z'])):
for j in list(db.keys()):#Ion_db:
if abs(df['m/z'][i]-j)*charge <= tolerance:
ion.append([db[j],df['m/z'][i]])
s += df['i.n'][i]
return s
def score2(lst, precursor_mass):
d = dict()
for i in lst:
C = intensity_bIon(i,ms1[precursor_mass],precursor_mass,0.1)+intensity_yIon(i,ms1[precursor_mass],precursor_mass,0.1)
f = math.exp(len(set(candidate_bIon(i,ms1[precursor_mass],precursor_mass,0.1)))+len(set(candidate_yIon(i,ms1[precursor_mass],precursor_mass,0.1))))
d[i] = math.log10(C*f)
return d
# Getting score for potential peptide candidates by each precursor mass
for i in name: # name is list of precursor mass
print(pd.DataFrame(list(score2(candidate[i],i).values()),list(score2(candidate[i],i).keys()), columns=['Score for mass %s'%i]))
##############################################################################
# ------- STEP 8: Function for assigning confidence on the findings ------- #
##############################################################################
def global_fdr(d):
threshold = 0
decoy = 0
results = []
for i in list(d.keys()):
if d[i] > threshold:
results.append(i)
for j in results:
if j in list(decoy_mass_db.values()):
decoy += 1
return round(decoy/len(results),4)
fdr=dict()
for i in name:
fdr[i] = global_fdr(score2(candidate[i],i))
print(pd.DataFrame(list(fdr.values()),list(fdr.keys()), columns = ['FDR']))
# ------- List of potential peptide candidates in target and decoy db ------- #
target = dict()
decoy = dict()
for i in name:
target[i] = candidate_ms1(i,100)[0]
decoy[i] = candidate_ms1(i,100)[1]