-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathEG_parseGene2Uniprot.py
More file actions
75 lines (51 loc) · 2.31 KB
/
EG_parseGene2Uniprot.py
File metadata and controls
75 lines (51 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Parse the Uniprot/Refseq collab file to create
# a mapping table between similar proteins.
import Config
import sys, string
import MySQLdb
import Database
from gzip import open as gopen
from classes import UniprotKB, Refseq
def _collabFilePairs( path, refseqHash, uniprotHash ) :
    """Yield ( refseqID, uniprotID ) for each usable line of the gzipped file at `path`.

    The first tab-separated column is looked up in `refseqHash` and the
    second in `uniprotHash`. Blank lines, lines starting with "#", lines with
    fewer than two columns, and lines whose accessions are missing from either
    hash are skipped.
    """
    with gopen( path ) as collabFile :
        # Iterate the handle directly so the whole file is never held in memory.
        for line in collabFile :
            line = line.strip( )

            # Skip blank lines and header lines
            if not line or line.startswith( "#" ) :
                continue

            splitLine = line.split( "\t" )

            # A line with a single column (for example an empty second column,
            # whose trailing tab was removed by strip above) has no UniProt
            # accession to read; skip it instead of raising IndexError.
            if len( splitLine ) < 2 :
                continue

            refseqAcc = splitLine[0].strip( )
            uniprotAcc = splitLine[1].strip( )

            if refseqAcc in refseqHash and uniprotAcc in uniprotHash :
                yield refseqHash[refseqAcc], uniprotHash[uniprotAcc]


def _insertNewMappings( cursor, mapping, pairs ) :
    """Insert each ( refseqID, uniprotID ) pair not yet recorded in `mapping`.

    `mapping` is a set of "refseqID|uniprotID" keys and is updated in place, so
    passing the same set to successive calls prevents duplicate rows across
    calls. A commit is issued every Config.DB_COMMIT_COUNT inserted rows and
    once more after the last pair.
    """
    insertQuery = "INSERT INTO " + Config.DB_NAME + ".protein_mapping VALUES( '0', %s, %s, 'active', NOW( ) )"

    insertCount = 0
    for refseqID, uniprotID in pairs :
        pairKey = refseqID + "|" + uniprotID
        if pairKey in mapping :
            continue

        cursor.execute( insertQuery, [refseqID, uniprotID] )
        mapping.add( pairKey )
        insertCount = insertCount + 1

        if 0 == ( insertCount % Config.DB_COMMIT_COUNT ) :
            Database.db.commit( )

    # Flush whatever is left over from the last partial batch.
    Database.db.commit( )


with Database.db as cursor :

    # Rebuild the table from scratch on every run.
    cursor.execute( "TRUNCATE TABLE " + Config.DB_NAME + ".protein_mapping" )
    Database.db.commit( )

    uniprotKB = UniprotKB.UniprotKB( Database.db, cursor )
    refseq = Refseq.Refseq( Database.db, cursor )

    refseqHash = refseq.buildFullRefseqMappingHash( )
    uniprotHash = uniprotKB.buildAccessionHash( )

    # Keys of every pair inserted so far, shared by both passes below.
    mapping = set( )

    # Pass 1: pairs listed in the gzipped collab file.
    _insertNewMappings( cursor, mapping, _collabFilePairs( Config.EG_REFSEQ2UNIPROT, refseqHash, uniprotHash ) )

    # Pass 2: pairs recorded as REFSEQ-PROTEIN-ACCESSION externals in the database.
    cursor.execute( "SELECT uniprot_external_value, uniprot_id FROM " + Config.DB_NAME + ".uniprot_externals WHERE uniprot_external_source='REFSEQ-PROTEIN-ACCESSION'" )

    # Fetch every row before inserting: the same cursor runs the INSERTs,
    # which would discard a result set that was still being read.
    externalRows = cursor.fetchall( )
    externalPairs = ( ( refseqHash[row[0]], str( row[1] ) ) for row in externalRows if row[0] in refseqHash )
    _insertNewMappings( cursor, mapping, externalPairs )

    # Record that this update script has run.
    cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'EG_parseGene2Uniprot', NOW( ) )" )
    Database.db.commit( )

sys.exit( )