-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathEG_parseGene2Refseq.py
More file actions
125 lines (89 loc) · 4.55 KB
/
EG_parseGene2Refseq.py
File metadata and controls
125 lines (89 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Parse all the refseq mappings to Entrez Gene IDs
# into a mapping table
import Config
import sys, string
import MySQLdb
import Database
import gzip
from classes import EntrezGene, Refseq
with Database.db as cursor :
entrezGene = EntrezGene.EntrezGene( Database.db, cursor )
refseq = Refseq.Refseq( Database.db, cursor )
existingEntrezGeneIDs = entrezGene.fetchExistingEntrezGeneIDs( )
refseqMap = refseq.buildRefseqMappingHash( )
organismList = entrezGene.fetchEntrezGeneOrganismMapping( )
cursor.execute( "TRUNCATE TABLE " + Config.DB_NAME + ".gene_refseqs" )
Database.db.commit( )
missingRefseqs = set( )
completedMappings = set( )
insertCount = 0
with gzip.open( Config.EG_GENE2REFSEQ, 'r' ) as file :
for line in file.readlines( ) :
line = line.strip( )
# Ignore Header Line
if "#" == line[0] :
continue
splitLine = line.split( "\t" )
entrezGeneTaxID = int(splitLine[0].strip( ))
sourceID = splitLine[1].strip( )
status = splitLine[2].strip( )
rnaAccessionFull = splitLine[3].strip( )
rnaGI = splitLine[4].strip( )
proteinAccessionFull = splitLine[5].strip( )
proteinGI = splitLine[6].strip( )
if sourceID in existingEntrezGeneIDs :
insertCount = insertCount + 1
currentGeneID = existingEntrezGeneIDs[sourceID]
if "-" != proteinAccessionFull :
proteinAccessionSplit = proteinAccessionFull.split( "." )
proteinAccession = proteinAccessionSplit[0]
proteinVersion = proteinAccessionSplit[1]
if proteinAccession in refseqMap :
refseqID = refseqMap[proteinAccession]
# Test to see if this pairing is already
# added so we don't add the same pairing
# multiple times.
idPair = str(currentGeneID) + "|" + str(refseqID)
if idPair not in completedMappings :
insertCount = insertCount + 1
completedMappings.add( idPair )
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_refseqs VALUES( '0', %s, %s, 'active', NOW( ), %s )", [refseqID, status, currentGeneID] )
if "-" != rnaAccessionFull :
rnaAccessionSplit = rnaAccessionFull.split( "." )
rnaAccession = rnaAccessionSplit[0]
rnaVersion = rnaAccessionSplit[1]
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq_identifiers VALUES ( '0', %s, %s, %s, 'active', NOW( ), %s )", [rnaAccession, 'rna-accession', rnaVersion, refseqID] )
if "-" != rnaGI :
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq_identifiers VALUES ( '0', %s, %s, '0', 'active', NOW( ), %s )", [rnaGI, 'rna-gi', refseqID] )
else :
missingRefseqs.add( str(proteinAccession) + "|" + str(currentGeneID) + "|" + str(entrezGeneTaxID) + "|" + status )
if 0 == (insertCount % Config.DB_COMMIT_COUNT ) :
Database.db.commit( )
Database.db.commit( )
# All ids not in our active set of proteins
# are probably old mapped ids that are now
# discontinued, disabled, etc.
# Load these as external identifiers for the genes
# simply for legacy purposes.
# Any NP_ names we come across, load those into the sequence table
# so we can load them later
print len( missingRefseqs )
for missingRefseq in missingRefseqs :
refseqSplit = missingRefseq.split( "|" )
if "NP" == refseqSplit[0][:2] and int(refseqSplit[2]) in organismList :
cursor.execute( "SELECT refseq_id FROM " + Config.DB_NAME + ".refseq WHERE refseq_accession=%s LIMIT 1", [refseqSplit[0]] )
row = cursor.fetchone( )
refseqID = ""
if None == row :
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq VALUES( '0', %s, '0', '', '0', '', '1', %s, NOW( ), 'active' )", [refseqSplit[0], organismList[int(refseqSplit[2])]] )
refseqID = cursor.lastrowid
else :
cursor.execute( "UPDATE " + Config.DB_NAME + ".refseq SET organism_id=%s, refseq_status='active' WHERE refseq_id=%s", [organismList[int(refseqSplit[2])], str(row[0])] )
refseqID = str(row[0])
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_refseqs VALUES( '0', %s, %s, 'active', NOW( ), %s )", [refseqID, refseqSplit[3], refseqSplit[1]] )
else :
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_externals VALUES( '0', %s, 'REFSEQ_LEGACY', 'active', NOW( ), %s )", [refseqSplit[0], refseqSplit[1]] )
Database.db.commit( )
cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'EG_parseGene2Refseq', NOW( ) )" )
Database.db.commit( )
sys.exit( )