-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathEG_updateGene2Refseq.py
More file actions
193 lines (142 loc) · 8.14 KB
/
EG_updateGene2Refseq.py
File metadata and controls
193 lines (142 loc) · 8.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# Parse all the refseq mappings to Entrez Gene IDs
# into a mapping table by a single organism
import Config
import sys, string, argparse
import MySQLdb
import Database
import gzip
from classes import EntrezGene, Refseq
# Process Command Line Input
argParser = argparse.ArgumentParser( description = 'Update all Refseq to Gene mappings from Entrez Gene that are relevant to the organism id passed in via the command line.' )
argGroup = argParser.add_mutually_exclusive_group( required=True )
argGroup.add_argument( '-o', help = 'NCBI Organism ID', type=int, dest = 'organismID', action='store' )
argGroup.add_argument( '-g', dest='genes', nargs = '+', help = 'An Entrez Gene ID List to Update', action='store' )
inputArgs = vars( argParser.parse_args( ) )
isOrganism = False
isGene = False
if None != inputArgs['organismID'] :
isOrganism = True
elif None != inputArgs['genes'] :
isGene = True
with Database.db as cursor :
entrezGene = EntrezGene.EntrezGene( Database.db, cursor )
refseq = Refseq.Refseq( Database.db, cursor )
refseqMap = refseq.buildRefseqMappingHash( )
organismList = entrezGene.fetchEntrezGeneOrganismMapping( )
existingEntrezGeneIDs = { }
organismID = 0
missingRefseqs = set( )
if isOrganism :
if inputArgs['organismID'][0] in organismList :
organismID = organismList[inputArgs['organismID'][0]]
existingEntrezGeneIDs = entrezGene.fetchExistingEntrezGeneIDsByOrganism( organismID )
elif isGene :
for gene in inputArgs['genes'] :
geneID = entrezGene.geneExists( gene )
if geneID :
existingEntrezGeneIDs[gene] = geneID
for entrezGeneID, geneID in existingEntrezGeneIDs.iteritems( ) :
cursor.execute( "UPDATE " + Config.DB_NAME + ".gene_refseqs SET gene_refseq_status='inactive' WHERE gene_id=%s", [geneID] )
Database.db.commit( )
insertCount = 0
with gzip.open( Config.EG_GENE2REFSEQ, 'r' ) as file :
for line in file.readlines( ) :
line = line.strip( )
# Ignore Header Line
if "#" == line[0] :
continue
splitLine = line.split( "\t" )
entrezGeneTaxID = int(splitLine[0].strip( ))
sourceID = splitLine[1].strip( )
status = splitLine[2].strip( )
rnaAccessionFull = splitLine[3].strip( )
rnaGI = splitLine[4].strip( )
proteinAccessionFull = splitLine[5].strip( )
proteinGI = splitLine[6].strip( )
if sourceID in existingEntrezGeneIDs :
insertCount = insertCount + 1
currentGeneID = existingEntrezGeneIDs[sourceID]
if "-" != proteinAccessionFull :
proteinAccessionSplit = proteinAccessionFull.split( "." )
proteinAccession = proteinAccessionSplit[0]
proteinVersion = proteinAccessionSplit[1]
if proteinAccession in refseqMap :
refseqID = refseqMap[proteinAccession]
cursor.execute( "SELECT gene_refseq_id FROM " + Config.DB_NAME + ".gene_refseqs WHERE refseq_id=%s AND gene_id=%s LIMIT 1", [refseqID, currentGeneID] )
row = cursor.fetchone( )
if None == row :
print "INSERTING REFSEQ MAPPING"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_refseqs VALUES( '0', %s, %s, 'active', NOW( ), %s )", [refseqID, status, currentGeneID] )
else :
print "UPDATING REFSEQ MAPPING"
cursor.execute( "UPDATE " + Config.DB_NAME + ".gene_refseqs SET gene_refseq_status='active', gene_refseq_modified=NOW( ) WHERE gene_refseq_id=%s", [row[0]] )
if "-" != rnaAccessionFull :
rnaAccessionSplit = rnaAccessionFull.split( "." )
rnaAccession = rnaAccessionSplit[0]
rnaVersion = rnaAccessionSplit[1]
cursor.execute( "SELECT refseq_identifier_id FROM " + Config.DB_NAME + ".refseq_identifiers WHERE refseq_id=%s AND refseq_identifier_value=%s LIMIT 1", [refseqID, rnaAccession] )
row = cursor.fetchone( )
if None == row :
print "INSERTING REFSEQ RNA ACCESSION IDENTIFIER"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq_identifiers VALUES ( '0', %s, %s, %s, 'active', NOW( ), %s )", [rnaAccession, 'rna-accession', rnaVersion, refseqID] )
else :
print "UPDATING REFSEQ RNA ACCESSION IDENTIFIER"
cursor.execute( "UPDATE " + Config.DB_NAME + ".refseq_identifiers SET refseq_identifier_status='active', refseq_identifier_version=%s, refseq_identifier_modified=NOW( ) WHERE refseq_identifier_id=%s", [rnaVersion, row[0]] )
if "-" != rnaGI :
cursor.execute( "SELECT refseq_identifier_id FROM " + Config.DB_NAME + ".refseq_identifiers WHERE refseq_id=%s AND refseq_identifier_value=%s LIMIT 1", [refseqID, rnaGI] )
row = cursor.fetchone( )
if None == row :
print "INSERTING REFSEQ RNA GI IDENTIFIER"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq_identifiers VALUES ( '0', %s, %s, '0', 'active', NOW( ), %s )", [rnaGI, 'rna-gi', refseqID] )
else :
print "UPDATING REFSEQ RNA GI IDENTIFIER"
cursor.execute( "UPDATE " + Config.DB_NAME + ".refseq_identifiers SET refseq_identifier_status='active', refseq_identifier_version='0', refseq_identifier_modified=NOW( ) WHERE refseq_identifier_id=%s", [row[0]] )
else :
missingRefseqs.add( str(proteinAccession) + "|" + str(currentGeneID) + "|" + str(entrezGeneTaxID) + "|" + status )
if 0 == (insertCount % Config.DB_COMMIT_COUNT ) :
Database.db.commit( )
Database.db.commit( )
# All ids not in our active set of proteins
# are probably old mapped ids that are now
# discontinued, disabled, etc.
# Load these as external identifiers for the genes
# simply for legacy purposes.
# Any NP_ names we come across, load those into the sequence table
# so we can load them later
print len( missingRefseqs )
for missingRefseq in missingRefseqs :
refseqSplit = missingRefseq.split( "|" )
#if "NP" == refseqSplit[0][:2] and int(refseqSplit[2]) in organismList :
if int(refseqSplit[2]) in organismList :
cursor.execute( "SELECT refseq_id FROM " + Config.DB_NAME + ".refseq WHERE refseq_accession=%s LIMIT 1", [refseqSplit[0]] )
row = cursor.fetchone( )
refseqID = ""
if None == row :
print "ADDING REFSEQ"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq VALUES( '0', %s, '0', '', '0', '', '1', %s, NOW( ), 'active' )", [refseqSplit[0], organismList[int(refseqSplit[2])]] )
refseqID = cursor.lastrowid
else :
print "UPDATING REFSEQ"
cursor.execute( "UPDATE " + Config.DB_NAME + ".refseq SET organism_id=%s, refseq_status='active' WHERE refseq_id=%s", [organismList[int(refseqSplit[2])], str(row[0])] )
refseqID = str(row[0])
cursor.execute( "SELECT gene_refseq_id FROM " + Config.DB_NAME + ".gene_refseqs WHERE refseq_id=%s AND gene_id=%s LIMIT 1", [refseqID, refseqSplit[1]] )
row = cursor.fetchone( )
if None == row :
print "INSERTING REFSEQ MAPPING"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_refseqs VALUES( '0', %s, %s, 'active', NOW( ), %s )", [refseqID, refseqSplit[3], refseqSplit[1]] )
else :
print "UPDATING REFSEQ MAPPING"
cursor.execute( "UPDATE " + Config.DB_NAME + ".gene_refseqs SET gene_refseq_status='active', gene_refseq_modified=NOW( ) WHERE gene_refseq_id=%s", [row[0]] )
else :
cursor.execute( "SELECT gene_external_id FROM " + Config.DB_NAME + ".gene_externals WHERE gene_external_value=%s AND gene_external_source='REFSEQ_LEGACY' AND gene_id=%s LIMIT 1", [refseqSplit[0], refseqSplit[1]] )
row = cursor.fetchone( )
if None == row :
print "INSERTING GENE EXTERNAL MAPPING"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_externals VALUES( '0', %s, 'REFSEQ-LEGACY', 'active', NOW( ), %s )", [refseqSplit[0], refseqSplit[1]] )
else :
print "UPDATING GENE EXTERNAL MAPPING"
cursor.execute( "UPDATE " + Config.DB_NAME + ".gene_externals SET gene_external_status='active', gene_external_modified=NOW( ) WHERE gene_external_id=%s", [row[0]] )
Database.db.commit( )
cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'EG_updateGene2Refseq', NOW( ) )" )
Database.db.commit( )
sys.exit( )